diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6af2cba10093..8c0f51145139 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,1017 +1,1017 @@
# See docs/CMake.html for instructions about how to build LLVM with CMake.
cmake_minimum_required(VERSION 3.4.3)
if(POLICY CMP0022)
cmake_policy(SET CMP0022 NEW) # automatic when 2.8.12 is required
endif()
if (POLICY CMP0051)
# CMake 3.1 and higher include generator expressions of the form
# $<TARGETLIB:obj> in the SOURCES property. These need to be
# stripped everywhere that accesses the SOURCES property, so we just
# defer to the OLD behavior of not including generator expressions
# in the output for now.
cmake_policy(SET CMP0051 OLD)
endif()
if(POLICY CMP0057)
cmake_policy(SET CMP0057 NEW)
endif()
if(NOT DEFINED LLVM_VERSION_MAJOR)
set(LLVM_VERSION_MAJOR 5)
endif()
if(NOT DEFINED LLVM_VERSION_MINOR)
set(LLVM_VERSION_MINOR 0)
endif()
if(NOT DEFINED LLVM_VERSION_PATCH)
set(LLVM_VERSION_PATCH 0)
endif()
if(NOT DEFINED LLVM_VERSION_SUFFIX)
set(LLVM_VERSION_SUFFIX "")
endif()
if (POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
set(cmake_3_0_PROJ_VERSION
VERSION ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH})
set(cmake_3_0_LANGUAGES LANGUAGES)
endif()
if (NOT PACKAGE_VERSION)
set(PACKAGE_VERSION
"${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}")
endif()
if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL ""))
message(WARNING "Visual Studio generators use the x86 host compiler by "
"default, even for 64-bit targets. This can result in linker "
"instability and out of memory errors. To use the 64-bit "
"host compiler, pass -Thost=x64 on the CMake command line.")
endif()
project(LLVM
${cmake_3_0_PROJ_VERSION}
${cmake_3_0_LANGUAGES}
C CXX ASM)
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "No build type selected, default to Debug")
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type (default Debug)" FORCE)
endif()
# This should only apply if you are both on an Apple host, and targeting Apple.
if(CMAKE_HOST_APPLE AND APPLE)
# if CMAKE_LIBTOOL is not set, try and find it with xcrun or find_program
if(NOT CMAKE_LIBTOOL)
if(NOT CMAKE_XCRUN)
find_program(CMAKE_XCRUN NAMES xcrun)
endif()
if(CMAKE_XCRUN)
execute_process(COMMAND ${CMAKE_XCRUN} -find libtool
OUTPUT_VARIABLE CMAKE_LIBTOOL
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
if(NOT CMAKE_LIBTOOL OR NOT EXISTS CMAKE_LIBTOOL)
find_program(CMAKE_LIBTOOL NAMES libtool)
endif()
endif()
get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
if(CMAKE_LIBTOOL)
set(CMAKE_LIBTOOL ${CMAKE_LIBTOOL} CACHE PATH "libtool executable")
message(STATUS "Found libtool - ${CMAKE_LIBTOOL}")
execute_process(COMMAND ${CMAKE_LIBTOOL} -V
OUTPUT_VARIABLE LIBTOOL_V_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE)
if("${LIBTOOL_V_OUTPUT}" MATCHES ".*cctools-([0-9.]+).*")
string(REGEX REPLACE ".*cctools-([0-9.]+).*" "\\1" LIBTOOL_VERSION
${LIBTOOL_V_OUTPUT})
if(NOT LIBTOOL_VERSION VERSION_LESS "862")
set(LIBTOOL_NO_WARNING_FLAG "-no_warning_for_no_symbols")
endif()
endif()
foreach(lang ${languages})
set(CMAKE_${lang}_CREATE_STATIC_LIBRARY
"${CMAKE_LIBTOOL} -static ${LIBTOOL_NO_WARNING_FLAG} -o <TARGET> \
<LINK_FLAGS> <OBJECTS> ")
endforeach()
endif()
# If DYLD_LIBRARY_PATH is set we need to set it on archiver commands
if(DYLD_LIBRARY_PATH)
set(dyld_envar "DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}")
foreach(lang ${languages})
foreach(cmd ${CMAKE_${lang}_CREATE_STATIC_LIBRARY})
list(APPEND CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW
"${dyld_envar} ${cmd}")
endforeach()
set(CMAKE_${lang}_CREATE_STATIC_LIBRARY
${CMAKE_${lang}_CREATE_STATIC_LIBRARY_NEW})
endforeach()
endif()
endif()
# Side-by-side subprojects layout: automatically set the
# LLVM_EXTERNAL_${project}_SOURCE_DIR using LLVM_ALL_PROJECTS
# This allows an easy way of setting up a build directory for llvm and another
# one for llvm+clang+... using the same sources.
set(LLVM_ALL_PROJECTS "clang;libcxx;libcxxabi;lldb;compiler-rt;lld;polly")
set(LLVM_ENABLE_PROJECTS "" CACHE STRING
"Semicolon-separated list of projects to build (${LLVM_ALL_PROJECTS}), or \"all\".")
if( LLVM_ENABLE_PROJECTS STREQUAL "all" )
set( LLVM_ENABLE_PROJECTS ${LLVM_ALL_PROJECTS})
endif()
foreach(proj ${LLVM_ENABLE_PROJECTS})
set(PROJ_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}")
if(NOT EXISTS "${PROJ_DIR}" OR NOT IS_DIRECTORY "${PROJ_DIR}")
message(FATAL_ERROR "LLVM_ENABLE_PROJECTS requests ${proj} but directory not found: ${PROJ_DIR}")
endif()
string(TOUPPER "${proj}" upper_proj)
STRING(REGEX REPLACE "-" "_" upper_proj ${upper_proj})
set(LLVM_EXTERNAL_${upper_proj}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${proj}")
# There is a widespread opinion that clang-tools-extra should be merged
# into clang. The following simulates it by always enabling clang-tools-extra
# when enabling clang.
if (proj STREQUAL "clang")
set(LLVM_EXTERNAL_CLANG_TOOLS_EXTRA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../clang-tools-extra")
endif()
endforeach()
# Build llvm with ccache if the package is present
set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build")
if(LLVM_CCACHE_BUILD)
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
set(LLVM_CCACHE_SIZE "" CACHE STRING "Size of ccache")
set(LLVM_CCACHE_DIR "" CACHE STRING "Directory to keep ccached data")
set(CCACHE_PROGRAM "CCACHE_CPP2=yes CCACHE_HASHDIR=yes ${CCACHE_PROGRAM}")
if (LLVM_CCACHE_SIZE)
set(CCACHE_PROGRAM "CCACHE_SIZE=${LLVM_CCACHE_SIZE} ${CCACHE_PROGRAM}")
endif()
if (LLVM_CCACHE_DIR)
set(CCACHE_PROGRAM "CCACHE_DIR=${LLVM_CCACHE_DIR} ${CCACHE_PROGRAM}")
endif()
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM})
else()
message(FATAL_ERROR "Unable to find the program ccache. Set LLVM_CCACHE_BUILD to OFF")
endif()
endif()
option(LLVM_DEPENDENCY_DEBUGGING "Dependency debugging mode to verify correctly expressed library dependencies (Darwin only)" OFF)
# Some features of the LLVM build may be disallowed when dependency debugging is
# enabled. In particular you cannot use ccache because we want to force compile
# operations to always happen.
if(LLVM_DEPENDENCY_DEBUGGING)
if(NOT CMAKE_HOST_APPLE)
message(FATAL_ERROR "Dependency debugging is only currently supported on Darwin hosts.")
endif()
if(LLVM_CCACHE_BUILD)
message(FATAL_ERROR "Cannot enable dependency debugging while using ccache.")
endif()
endif()
option(LLVM_BUILD_GLOBAL_ISEL "Experimental: Build GlobalISel" ON)
if(LLVM_BUILD_GLOBAL_ISEL)
add_definitions(-DLLVM_BUILD_GLOBAL_ISEL)
endif()
option(LLVM_ENABLE_DAGISEL_COV "Debug: Prints tablegen patterns that were used for selecting" OFF)
# Add path for custom modules
set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_SOURCE_DIR}/cmake"
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules"
)
# Generate a CompilationDatabase (compile_commands.json file) for our build,
# for use by clang_complete, YouCompleteMe, etc.
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
option(LLVM_INSTALL_UTILS "Include utility binaries in the 'install' target." OFF)
option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF)
option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON)
if ( LLVM_USE_FOLDERS )
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
endif()
include(VersionFromVCS)
option(LLVM_APPEND_VC_REV
"Embed the version control system revision id in LLVM" ON)
if( LLVM_APPEND_VC_REV )
add_version_info_from_vcs(PACKAGE_VERSION)
endif()
set(PACKAGE_NAME LLVM)
set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
set(PACKAGE_BUGREPORT "http://llvm.org/bugs/")
set(BUG_REPORT_URL "${PACKAGE_BUGREPORT}" CACHE STRING
"Default URL where bug reports are to be submitted.")
# Configure CPack.
set(CPACK_PACKAGE_INSTALL_DIRECTORY "LLVM")
set(CPACK_PACKAGE_VENDOR "LLVM")
set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${LLVM_VERSION_PATCH})
set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION})
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.TXT")
set(CPACK_NSIS_COMPRESSOR "/SOLID lzma \r\n SetCompressorDictSize 32")
if(WIN32 AND NOT UNIX)
set(CPACK_PACKAGE_INSTALL_REGISTRY_KEY "LLVM")
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_logo.bmp")
set(CPACK_NSIS_MUI_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico")
set(CPACK_NSIS_MUI_UNIICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_icon.ico")
set(CPACK_NSIS_MODIFY_PATH "ON")
set(CPACK_NSIS_ENABLE_UNINSTALL_BEFORE_INSTALL "ON")
set(CPACK_NSIS_EXTRA_INSTALL_COMMANDS
"ExecWait '$INSTDIR/tools/msbuild/install.bat'")
set(CPACK_NSIS_EXTRA_UNINSTALL_COMMANDS
"ExecWait '$INSTDIR/tools/msbuild/uninstall.bat'")
if( CMAKE_CL_64 )
set(CPACK_NSIS_INSTALL_ROOT "$PROGRAMFILES64")
endif()
endif()
include(CPack)
# Sanity check our source directory to make sure that we are not trying to
# generate an in-tree build (unless on MSVC_IDE, where it is ok), and to make
# sure that we don't have any stray generated files lying around in the tree
# (which would end up getting picked up by header search, instead of the correct
# versions).
if( CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE )
message(FATAL_ERROR "In-source builds are not allowed.
CMake would overwrite the makefiles distributed with LLVM.
Please create a directory and run cmake from there, passing the path
to this source directory as the last argument.
This process created the file `CMakeCache.txt' and the directory `CMakeFiles'.
Please delete them.")
endif()
if( NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR )
file(GLOB_RECURSE
tablegenned_files_on_include_dir
"${CMAKE_CURRENT_SOURCE_DIR}/include/llvm/*.gen")
file(GLOB_RECURSE
tablegenned_files_on_lib_dir
"${CMAKE_CURRENT_SOURCE_DIR}/lib/Target/*.inc")
if( tablegenned_files_on_include_dir OR tablegenned_files_on_lib_dir)
message(FATAL_ERROR "Apparently there is a previous in-source build,
probably as the result of running `configure' and `make' on
${CMAKE_CURRENT_SOURCE_DIR}.
This may cause problems. The suspicious files are:
${tablegenned_files_on_lib_dir}
${tablegenned_files_on_include_dir}
Please clean the source directory.")
endif()
endif()
string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
if (CMAKE_BUILD_TYPE AND
NOT uppercase_CMAKE_BUILD_TYPE MATCHES "^(DEBUG|RELEASE|RELWITHDEBINFO|MINSIZEREL)$")
message(FATAL_ERROR "Invalid value for CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
endif()
set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" )
set(LLVM_TOOLS_INSTALL_DIR "bin" CACHE STRING "Path for binary subdirectory (defaults to 'bin')")
mark_as_advanced(LLVM_TOOLS_INSTALL_DIR)
set(LLVM_UTILS_INSTALL_DIR "bin" CACHE STRING
"Path to install LLVM utilities (enabled by LLVM_INSTALL_UTILS=ON) (defaults to LLVM_TOOLS_INSTALL_DIR)")
mark_as_advanced(LLVM_UTILS_INSTALL_DIR)
# These are used as the destinations for target generators.
set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
if(WIN32 OR CYGWIN)
# DLL platform -- put DLLs into bin.
set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR})
else()
set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
endif()
# Each of these corresponds to an llvm-config option.
set(LLVM_TOOLS_BINARY_DIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) # --bindir
set(LLVM_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) # --libdir
set(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR} ) # --src-root
set(LLVM_MAIN_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/include ) # --includedir
set(LLVM_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR} ) # --prefix
# Note: LLVM_CMAKE_PATH does not include generated files
set(LLVM_CMAKE_PATH ${LLVM_MAIN_SRC_DIR}/cmake/modules)
set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples)
set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
+# List of all targets to be built by default:
set(LLVM_ALL_TARGETS
AArch64
AMDGPU
ARM
BPF
Hexagon
Lanai
Mips
MSP430
NVPTX
PowerPC
- RISCV
Sparc
SystemZ
X86
XCore
)
# List of targets with JIT support:
set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ)
set(LLVM_TARGETS_TO_BUILD "all"
CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD ""
CACHE STRING "Semicolon-separated list of experimental targets to build.")
option(BUILD_SHARED_LIBS
"Build all libraries as shared libraries instead of static" OFF)
option(LLVM_ENABLE_BACKTRACES "Enable embedding backtraces on crash." ON)
if(LLVM_ENABLE_BACKTRACES)
set(ENABLE_BACKTRACES 1)
endif()
option(LLVM_ENABLE_CRASH_OVERRIDES "Enable crash overrides." ON)
if(LLVM_ENABLE_CRASH_OVERRIDES)
set(ENABLE_CRASH_OVERRIDES 1)
endif()
option(LLVM_ENABLE_FFI "Use libffi to call external functions from the interpreter" OFF)
set(FFI_LIBRARY_DIR "" CACHE PATH "Additional directory, where CMake should search for libffi.so")
set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should search for ffi.h or ffi/ffi.h")
set(LLVM_TARGET_ARCH "host"
CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.")
option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON)
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
option(LLVM_ENABLE_ZLIB "Use zlib for compression/decompression if available." ON)
if( LLVM_TARGETS_TO_BUILD STREQUAL "all" )
set( LLVM_TARGETS_TO_BUILD ${LLVM_ALL_TARGETS} )
endif()
set(LLVM_TARGETS_TO_BUILD
${LLVM_TARGETS_TO_BUILD}
${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD})
list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD)
option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON)
option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON)
option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." ON)
option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." OFF)
else()
option(LLVM_ENABLE_MODULE_DEBUGGING "Compile with -gmodules." OFF)
option(LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY "Compile with -fmodules-local-submodule-visibility." ON)
endif()
option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF)
option(LLVM_ENABLE_CXX1Z "Compile with C++1z enabled." OFF)
option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF)
option(LLVM_ENABLE_LLD "Use lld as C and C++ linker." OFF)
option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON)
option(LLVM_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
option(LLVM_ENABLE_ASSERTIONS "Enable assertions" OFF)
else()
option(LLVM_ENABLE_ASSERTIONS "Enable assertions" ON)
endif()
option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF)
set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING
"Enable abi-breaking checks. Can be WITH_ASSERTS, FORCE_ON or FORCE_OFF.")
option(LLVM_FORCE_USE_OLD_HOST_TOOLCHAIN
"Set to ON to force using an old, unsupported host toolchain." OFF)
option(LLVM_USE_INTEL_JITEVENTS
"Use Intel JIT API to inform Intel(R) VTune(TM) Amplifier XE 2011 about JIT code"
OFF)
if( LLVM_USE_INTEL_JITEVENTS )
# Verify we are on a supported platform
if( NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
message(FATAL_ERROR
"Intel JIT API support is available on Linux and Windows only.")
endif()
endif( LLVM_USE_INTEL_JITEVENTS )
option(LLVM_USE_OPROFILE
"Use opagent JIT interface to inform OProfile about JIT code" OFF)
option(LLVM_EXTERNALIZE_DEBUGINFO
"Generate dSYM files and strip executables and libraries (Darwin Only)" OFF)
# If enabled, verify we are on a platform that supports oprofile.
if( LLVM_USE_OPROFILE )
if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
message(FATAL_ERROR "OProfile support is available on Linux only.")
endif( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
endif( LLVM_USE_OPROFILE )
set(LLVM_USE_SANITIZER "" CACHE STRING
"Define the sanitizer used to build binaries and tests.")
option(LLVM_USE_SPLIT_DWARF
"Use -gsplit-dwarf when compiling llvm." OFF)
option(LLVM_POLLY_LINK_INTO_TOOLS "Statically link Polly into tools (if available)" ON)
option(LLVM_POLLY_BUILD "Build LLVM with Polly" ON)
if (EXISTS ${LLVM_MAIN_SRC_DIR}/tools/polly/CMakeLists.txt)
set(POLLY_IN_TREE TRUE)
elseif(LLVM_EXTERNAL_POLLY_SOURCE_DIR)
set(POLLY_IN_TREE TRUE)
else()
set(POLLY_IN_TREE FALSE)
endif()
if (LLVM_POLLY_BUILD AND POLLY_IN_TREE)
set(WITH_POLLY ON)
else()
set(WITH_POLLY OFF)
endif()
if (LLVM_POLLY_LINK_INTO_TOOLS AND WITH_POLLY)
set(LINK_POLLY_INTO_TOOLS ON)
else()
set(LINK_POLLY_INTO_TOOLS OFF)
endif()
# Define an option controlling whether we should build for 32-bit on 64-bit
# platforms, where supported.
if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
# TODO: support other platforms and toolchains.
option(LLVM_BUILD_32_BITS "Build 32 bits executables and libraries." OFF)
endif()
# Define the default arguments to use with 'lit', and an option for the user to
# override.
set(LIT_ARGS_DEFAULT "-sv")
if (MSVC_IDE OR XCODE)
set(LIT_ARGS_DEFAULT "${LIT_ARGS_DEFAULT} --no-progress-bar")
endif()
set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
# On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
if( WIN32 AND NOT CYGWIN )
set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
endif()
# Define options to control the inclusion and default build behavior for
# components which may not strictly be necessary (tools, examples, and tests).
#
# This is primarily to support building smaller or faster project files.
option(LLVM_INCLUDE_TOOLS "Generate build targets for the LLVM tools." ON)
option(LLVM_BUILD_TOOLS
"Build the LLVM tools. If OFF, just generate build targets." ON)
option(LLVM_INCLUDE_UTILS "Generate build targets for the LLVM utils." ON)
option(LLVM_BUILD_UTILS
"Build LLVM utility binaries. If OFF, just generate build targets." ON)
option(LLVM_INCLUDE_RUNTIMES "Generate build targets for the LLVM runtimes." ON)
option(LLVM_BUILD_RUNTIMES
"Build the LLVM runtimes. If OFF, just generate build targets." ON)
option(LLVM_BUILD_RUNTIME
"Build the LLVM runtime libraries." ON)
option(LLVM_BUILD_EXAMPLES
"Build the LLVM example programs. If OFF, just generate build targets." OFF)
option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON)
option(LLVM_BUILD_TESTS
"Build LLVM unit tests. If OFF, just generate build targets." OFF)
option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON)
option(LLVM_INCLUDE_GO_TESTS "Include the Go bindings tests in test build targets." ON)
option (LLVM_BUILD_DOCS "Build the llvm documentation." OFF)
option (LLVM_INCLUDE_DOCS "Generate build targets for llvm documentation." ON)
option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OFF)
option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
set(LLVM_INSTALL_DOXYGEN_HTML_DIR "share/doc/llvm/doxygen-html"
CACHE STRING "Doxygen-generated HTML documentation install directory")
set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "share/doc/llvm/ocaml-html"
CACHE STRING "OCamldoc-generated HTML documentation install directory")
option (LLVM_BUILD_EXTERNAL_COMPILER_RT
"Build compiler-rt as an external project." OFF)
option (LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
"Show target and host info when tools are invoked with --version." ON)
# You can configure which libraries from LLVM you want to include in the
# shared library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited
# list of LLVM components. All component names handled by llvm-config are valid.
if(NOT DEFINED LLVM_DYLIB_COMPONENTS)
set(LLVM_DYLIB_COMPONENTS "all" CACHE STRING
"Semicolon-separated list of components to include in libLLVM, or \"all\".")
endif()
option(LLVM_LINK_LLVM_DYLIB "Link tools against the libllvm dynamic library" OFF)
option(LLVM_BUILD_LLVM_C_DYLIB "Build libllvm-c re-export library (Darwin Only)" OFF)
set(LLVM_BUILD_LLVM_DYLIB_default OFF)
if(LLVM_LINK_LLVM_DYLIB OR LLVM_BUILD_LLVM_C_DYLIB)
set(LLVM_BUILD_LLVM_DYLIB_default ON)
endif()
option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" ${LLVM_BUILD_LLVM_DYLIB_default})
option(LLVM_DYLIB_SYMBOL_VERSIONING OFF)
option(LLVM_OPTIMIZED_TABLEGEN "Force TableGen to be built with optimization" OFF)
if(CMAKE_CROSSCOMPILING OR (LLVM_OPTIMIZED_TABLEGEN AND (LLVM_ENABLE_ASSERTIONS OR CMAKE_CONFIGURATION_TYPES)))
set(LLVM_USE_HOST_TOOLS ON)
endif()
if (MSVC_IDE AND NOT (MSVC_VERSION LESS 1900))
option(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION "Configure project to use Visual Studio native visualizers" TRUE)
else()
set(LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION FALSE CACHE INTERNAL "For Visual Studio 2013, manually copy natvis files to Documents\\Visual Studio 2013\\Visualizers" FORCE)
endif()
if (LLVM_BUILD_INSTRUMENTED OR LLVM_BUILD_INSTRUMENTED_COVERAGE)
if(NOT LLVM_PROFILE_MERGE_POOL_SIZE)
# A pool size of 1-2 is probably sufficient on an SSD. 3-4 should be fine
# for spinning disks. Anything higher may only help on slower media.
set(LLVM_PROFILE_MERGE_POOL_SIZE "4")
endif()
if(NOT LLVM_PROFILE_FILE_PATTERN)
if(NOT LLVM_PROFILE_DATA_DIR)
file(TO_NATIVE_PATH "${LLVM_BINARY_DIR}/profiles/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN)
else()
file(TO_NATIVE_PATH "${LLVM_PROFILE_DATA_DIR}/%${LLVM_PROFILE_MERGE_POOL_SIZE}m.profraw" LLVM_PROFILE_FILE_PATTERN)
endif()
endif()
endif()
if (LLVM_BUILD_STATIC)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
endif()
# Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV.
set(LLVM_TARGET_TRIPLE_ENV CACHE STRING "The name of environment variable to override default target. Disabled by blank.")
mark_as_advanced(LLVM_TARGET_TRIPLE_ENV)
# All options referred to from HandleLLVMOptions have to be specified
# BEFORE this include, otherwise options will not be correctly set on
# first cmake run
include(config-ix)
string(REPLACE "Native" ${LLVM_NATIVE_ARCH}
LLVM_TARGETS_TO_BUILD "${LLVM_TARGETS_TO_BUILD}")
list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD)
# By default, we target the host, but this can be overridden at CMake
# invocation time.
set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}" CACHE STRING
"Default target for which LLVM will generate code." )
set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}")
message(STATUS "LLVM host triple: ${LLVM_HOST_TRIPLE}")
message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}")
include(HandleLLVMOptions)
# Verify that we can find a Python 2 interpreter. Python 3 is unsupported.
# FIXME: We should support systems with only Python 3, but that requires work
# on LLDB.
set(Python_ADDITIONAL_VERSIONS 2.7)
include(FindPythonInterp)
if( NOT PYTHONINTERP_FOUND )
message(FATAL_ERROR
"Unable to find Python interpreter, required for builds and testing.
Please install Python or specify the PYTHON_EXECUTABLE CMake variable.")
endif()
if( ${PYTHON_VERSION_STRING} VERSION_LESS 2.7 )
message(FATAL_ERROR "Python 2.7 or newer is required")
endif()
######
# LLVMBuild Integration
#
# We use llvm-build to generate all the data required by the CMake based
# build system in one swoop:
#
# - We generate a file (a CMake fragment) in the object root which contains
# all the definitions that are required by CMake.
#
# - We generate the library table used by llvm-config.
#
# - We generate the dependencies for the CMake fragment, so that we will
# automatically reconfigure ourselves.
set(LLVMBUILDTOOL "${LLVM_MAIN_SRC_DIR}/utils/llvm-build/llvm-build")
set(LLVMCONFIGLIBRARYDEPENDENCIESINC
"${LLVM_BINARY_DIR}/tools/llvm-config/LibraryDependencies.inc")
set(LLVMBUILDCMAKEFRAG
"${LLVM_BINARY_DIR}/LLVMBuild.cmake")
# Create the list of optional components that are enabled
if (LLVM_USE_INTEL_JITEVENTS)
set(LLVMOPTIONALCOMPONENTS IntelJITEvents)
endif (LLVM_USE_INTEL_JITEVENTS)
if (LLVM_USE_OPROFILE)
set(LLVMOPTIONALCOMPONENTS ${LLVMOPTIONALCOMPONENTS} OProfileJIT)
endif (LLVM_USE_OPROFILE)
message(STATUS "Constructing LLVMBuild project information")
execute_process(
COMMAND ${PYTHON_EXECUTABLE} -B ${LLVMBUILDTOOL}
--native-target "${LLVM_NATIVE_ARCH}"
--enable-targets "${LLVM_TARGETS_TO_BUILD}"
--enable-optional-components "${LLVMOPTIONALCOMPONENTS}"
--write-library-table ${LLVMCONFIGLIBRARYDEPENDENCIESINC}
--write-cmake-fragment ${LLVMBUILDCMAKEFRAG}
OUTPUT_VARIABLE LLVMBUILDOUTPUT
ERROR_VARIABLE LLVMBUILDERRORS
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE LLVMBUILDRESULT)
# On Win32, CMake doesn't properly handle piping the default output/error
# streams into the GUI console. So, we explicitly catch and report them.
if( NOT "${LLVMBUILDOUTPUT}" STREQUAL "")
message(STATUS "llvm-build output: ${LLVMBUILDOUTPUT}")
endif()
if( NOT "${LLVMBUILDRESULT}" STREQUAL "0" )
message(FATAL_ERROR
"Unexpected failure executing llvm-build: ${LLVMBUILDERRORS}")
endif()
# Include the generated CMake fragment. This will define properties from the
# LLVMBuild files in a format which is easy to consume from CMake, and will add
# the dependencies so that CMake will reconfigure properly when the LLVMBuild
# files change.
include(${LLVMBUILDCMAKEFRAG})
######
# Configure all of the various header file fragments LLVM uses which depend on
# configuration variables.
set(LLVM_ENUM_TARGETS "")
set(LLVM_ENUM_ASM_PRINTERS "")
set(LLVM_ENUM_ASM_PARSERS "")
set(LLVM_ENUM_DISASSEMBLERS "")
foreach(t ${LLVM_TARGETS_TO_BUILD})
set( td ${LLVM_MAIN_SRC_DIR}/lib/Target/${t} )
list(FIND LLVM_ALL_TARGETS ${t} idx)
list(FIND LLVM_EXPERIMENTAL_TARGETS_TO_BUILD ${t} idy)
if( idx LESS 0 AND idy LESS 0 )
message(FATAL_ERROR "The target `${t}' does not exist.
It should be one of\n${LLVM_ALL_TARGETS}")
else()
set(LLVM_ENUM_TARGETS "${LLVM_ENUM_TARGETS}LLVM_TARGET(${t})\n")
endif()
file(GLOB asmp_file "${td}/*AsmPrinter.cpp")
if( asmp_file )
set(LLVM_ENUM_ASM_PRINTERS
"${LLVM_ENUM_ASM_PRINTERS}LLVM_ASM_PRINTER(${t})\n")
endif()
if( EXISTS ${td}/AsmParser/CMakeLists.txt )
set(LLVM_ENUM_ASM_PARSERS
"${LLVM_ENUM_ASM_PARSERS}LLVM_ASM_PARSER(${t})\n")
endif()
if( EXISTS ${td}/Disassembler/CMakeLists.txt )
set(LLVM_ENUM_DISASSEMBLERS
"${LLVM_ENUM_DISASSEMBLERS}LLVM_DISASSEMBLER(${t})\n")
endif()
endforeach(t)
# Produce the target definition files, which provide a way for clients to easily
# include various classes of targets.
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmPrinters.def.in
${LLVM_INCLUDE_DIR}/llvm/Config/AsmPrinters.def
)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/AsmParsers.def.in
${LLVM_INCLUDE_DIR}/llvm/Config/AsmParsers.def
)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Disassemblers.def.in
${LLVM_INCLUDE_DIR}/llvm/Config/Disassemblers.def
)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/Targets.def.in
${LLVM_INCLUDE_DIR}/llvm/Config/Targets.def
)
# Configure the three LLVM configuration header files.
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/config.h.cmake
${LLVM_INCLUDE_DIR}/llvm/Config/config.h)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/llvm-config.h.cmake
${LLVM_INCLUDE_DIR}/llvm/Config/llvm-config.h)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Config/abi-breaking.h.cmake
${LLVM_INCLUDE_DIR}/llvm/Config/abi-breaking.h)
configure_file(
${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/DataTypes.h.cmake
${LLVM_INCLUDE_DIR}/llvm/Support/DataTypes.h)
# Add target for generating source rpm package.
set(LLVM_SRPM_USER_BINARY_SPECFILE ${CMAKE_CURRENT_SOURCE_DIR}/llvm.spec.in
CACHE FILEPATH ".spec file to use for srpm generation")
set(LLVM_SRPM_BINARY_SPECFILE ${CMAKE_CURRENT_BINARY_DIR}/llvm.spec)
set(LLVM_SRPM_DIR "${CMAKE_CURRENT_BINARY_DIR}/srpm")
# SVN_REVISION and GIT_COMMIT get set by the call to add_version_info_from_vcs.
# DUMMY_VAR contains a version string which we don't care about.
add_version_info_from_vcs(DUMMY_VAR)
if ( SVN_REVISION )
set(LLVM_RPM_SPEC_REVISION "r${SVN_REVISION}")
elseif ( GIT_COMMIT )
set (LLVM_RPM_SPEC_REVISION "g${GIT_COMMIT}")
endif()
configure_file(
${LLVM_SRPM_USER_BINARY_SPECFILE}
${LLVM_SRPM_BINARY_SPECFILE} @ONLY)
add_custom_target(srpm
COMMAND cpack -G TGZ --config CPackSourceConfig.cmake -B ${LLVM_SRPM_DIR}/SOURCES
COMMAND rpmbuild -bs --define '_topdir ${LLVM_SRPM_DIR}' ${LLVM_SRPM_BINARY_SPECFILE})
# They are not referenced. See set_output_directory().
set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin )
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} )
if(APPLE AND DARWIN_LTO_LIBRARY)
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}")
set(CMAKE_SHARED_LINKER_FLAGS
"${CMAKE_SHARED_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}")
set(CMAKE_MODULE_LINKER_FLAGS
"${CMAKE_MODULE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}")
endif()
# Work around a broken bfd ld behavior. When linking a binary with a
# foo.so library, it will try to find any library that foo.so uses and
# check its symbols. This is wasteful (the check was done when foo.so
# was created) and can fail since it is not the dynamic linker and
# doesn't know how to handle search paths correctly.
if (UNIX AND NOT APPLE AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "SunOS|AIX")
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -Wl,-allow-shlib-undefined")
endif()
set(CMAKE_INCLUDE_CURRENT_DIR ON)
include_directories( ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
# when crosscompiling import the executable targets from a file
if(LLVM_USE_HOST_TOOLS)
include(CrossCompile)
endif(LLVM_USE_HOST_TOOLS)
if(LLVM_TARGET_IS_CROSSCOMPILE_HOST)
# Dummy use to avoid CMake Warning: Manually-specified variables were not used
# (this is a variable that CrossCompile sets on recursive invocations)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
# On FreeBSD, /usr/local/* is not used by default. In order to build LLVM
# with libxml2, iconv.h, etc., we must add /usr/local paths.
include_directories("/usr/local/include")
link_directories("/usr/local/lib")
endif(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
# special hack for Solaris to handle crazy system sys/regset.h
include_directories("${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/Solaris")
endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
# Make sure we don't get -rdynamic in every binary. For those that need it,
# use export_executable_symbols(target).
set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS "")
set(LLVM_PROFDATA_FILE "" CACHE FILEPATH
"Profiling data file to use when compiling in order to improve runtime performance.")
if(LLVM_PROFDATA_FILE AND EXISTS ${LLVM_PROFDATA_FILE})
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
add_definitions("-fprofile-instr-use=${LLVM_PROFDATA_FILE}")
else()
message(FATAL_ERROR "LLVM_PROFDATA_FILE can only be specified when compiling with clang")
endif()
endif()
include(AddLLVM)
include(TableGen)
if( MINGW )
# People report that -O3 is unreliable on MinGW. The traditional
# build also uses -O2 for that reason:
llvm_replace_compiler_option(CMAKE_CXX_FLAGS_RELEASE "-O3" "-O2")
endif()
# Put this before tblgen. Else we have a circular dependence.
add_subdirectory(lib/Demangle)
add_subdirectory(lib/Support)
add_subdirectory(lib/TableGen)
add_subdirectory(utils/TableGen)
# Force target to be built as soon as possible. Clang modules builds depend
# header-wise on it as they ship all headers from the umbrella folders. Building
# an entire module might include a header that depends on intrinsics_gen. This
# should be right after LLVMSupport and LLVMTableGen otherwise we introduce a
# circular dependence.
if (LLVM_ENABLE_MODULES)
list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen)
endif(LLVM_ENABLE_MODULES)
add_subdirectory(include/llvm)
add_subdirectory(lib)
if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/FileCheck)
add_subdirectory(utils/PerfectShuffle)
add_subdirectory(utils/count)
add_subdirectory(utils/not)
add_subdirectory(utils/llvm-lit)
add_subdirectory(utils/yaml-bench)
else()
if ( LLVM_INCLUDE_TESTS )
message(FATAL_ERROR "Including tests when not building utils will not work.
Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLUDE_TESTS to Off.")
endif()
endif()
# Use LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION instead of LLVM_INCLUDE_UTILS because it is not really a util
if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
add_subdirectory(utils/LLVMVisualizers)
endif()
foreach( binding ${LLVM_BINDINGS_LIST} )
if( EXISTS "${LLVM_MAIN_SRC_DIR}/bindings/${binding}/CMakeLists.txt" )
add_subdirectory(bindings/${binding})
endif()
endforeach()
add_subdirectory(projects)
if( LLVM_INCLUDE_TOOLS )
add_subdirectory(tools)
endif()
if( LLVM_INCLUDE_RUNTIMES )
add_subdirectory(runtimes)
endif()
if( LLVM_INCLUDE_EXAMPLES )
add_subdirectory(examples)
endif()
if( LLVM_INCLUDE_TESTS )
if(EXISTS ${LLVM_MAIN_SRC_DIR}/projects/test-suite AND TARGET clang)
include(LLVMExternalProjectUtils)
llvm_ExternalProject_Add(test-suite ${LLVM_MAIN_SRC_DIR}/projects/test-suite
USE_TOOLCHAIN
EXCLUDE_FROM_ALL
NO_INSTALL
ALWAYS_CLEAN)
endif()
add_subdirectory(test)
add_subdirectory(unittests)
if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/unittest)
endif()
if (WIN32)
# This utility is used to prevent crashing tests from calling Dr. Watson on
# Windows.
add_subdirectory(utils/KillTheDoctor)
endif()
# Add a global check rule now that all subdirectories have been traversed
# and we know the total set of lit testsuites.
get_property(LLVM_LIT_TESTSUITES GLOBAL PROPERTY LLVM_LIT_TESTSUITES)
get_property(LLVM_LIT_PARAMS GLOBAL PROPERTY LLVM_LIT_PARAMS)
get_property(LLVM_LIT_DEPENDS GLOBAL PROPERTY LLVM_LIT_DEPENDS)
get_property(LLVM_LIT_EXTRA_ARGS GLOBAL PROPERTY LLVM_LIT_EXTRA_ARGS)
get_property(LLVM_ADDITIONAL_TEST_TARGETS
GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_TARGETS)
get_property(LLVM_ADDITIONAL_TEST_DEPENDS
GLOBAL PROPERTY LLVM_ADDITIONAL_TEST_DEPENDS)
add_lit_target(check-all
"Running all regression tests"
${LLVM_LIT_TESTSUITES}
PARAMS ${LLVM_LIT_PARAMS}
DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_TARGETS}
ARGS ${LLVM_LIT_EXTRA_ARGS}
)
if(TARGET check-runtimes)
add_dependencies(check-all check-runtimes)
endif()
add_custom_target(test-depends
DEPENDS ${LLVM_LIT_DEPENDS} ${LLVM_ADDITIONAL_TEST_DEPENDS})
set_target_properties(test-depends PROPERTIES FOLDER "Tests")
endif()
if (LLVM_INCLUDE_DOCS)
add_subdirectory(docs)
endif()
add_subdirectory(cmake/modules)
if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
install(DIRECTORY include/llvm include/llvm-c
DESTINATION include
COMPONENT llvm-headers
FILES_MATCHING
PATTERN "*.def"
PATTERN "*.h"
PATTERN "*.td"
PATTERN "*.inc"
PATTERN "LICENSE.TXT"
PATTERN ".svn" EXCLUDE
)
install(DIRECTORY ${LLVM_INCLUDE_DIR}/llvm
DESTINATION include
COMPONENT llvm-headers
FILES_MATCHING
PATTERN "*.def"
PATTERN "*.h"
PATTERN "*.gen"
PATTERN "*.inc"
# Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def"
PATTERN "CMakeFiles" EXCLUDE
PATTERN "config.h" EXCLUDE
PATTERN ".svn" EXCLUDE
)
# Installing the headers needs to depend on generating any public
# tablegen'd headers.
add_custom_target(llvm-headers DEPENDS intrinsics_gen)
if (NOT CMAKE_CONFIGURATION_TYPES)
add_custom_target(install-llvm-headers
DEPENDS llvm-headers
COMMAND "${CMAKE_COMMAND}"
-DCMAKE_INSTALL_COMPONENT=llvm-headers
-P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
endif()
endif()
# This must be at the end of the LLVM root CMakeLists file because it must run
# after all targets are created.
if(LLVM_DISTRIBUTION_COMPONENTS)
if(CMAKE_CONFIGURATION_TYPES)
message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
endif()
add_custom_target(distribution)
add_custom_target(install-distribution)
foreach(target ${LLVM_DISTRIBUTION_COMPONENTS})
if(TARGET ${target})
add_dependencies(distribution ${target})
else()
message(FATAL_ERROR "Specified distribution component '${target}' doesn't have a target")
endif()
if(TARGET install-${target})
add_dependencies(install-distribution install-${target})
else()
message(FATAL_ERROR "Specified distribution component '${target}' doesn't have an install target")
endif()
endforeach()
endif()
# This allows us to deploy the Universal CRT DLLs by passing -DCMAKE_INSTALL_UCRT_LIBRARIES=ON to CMake
if (MSVC)
include(InstallRequiredSystemLibraries)
endif()
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 44efc1498060..5c65864e901e 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1,14335 +1,14339 @@
==============================
LLVM Language Reference Manual
==============================
.. contents::
:local:
:depth: 4
Abstract
========
This document is a reference manual for the LLVM assembly language. LLVM
is a Static Single Assignment (SSA) based representation that provides
type safety, low-level operations, flexibility, and the capability of
representing 'all' high-level languages cleanly. It is the common code
representation used throughout all phases of the LLVM compilation
strategy.
Introduction
============
The LLVM code representation is designed to be used in three different
forms: as an in-memory compiler IR, as an on-disk bitcode representation
(suitable for fast loading by a Just-In-Time compiler), and as a human
readable assembly language representation. This allows LLVM to provide a
powerful intermediate representation for efficient compiler
transformations and analysis, while providing a natural means to debug
and visualize the transformations. The three different forms of LLVM are
all equivalent. This document describes the human readable
representation and notation.
The LLVM representation aims to be light-weight and low-level while
being expressive, typed, and extensible at the same time. It aims to be
a "universal IR" of sorts, by being at a low enough level that
high-level ideas may be cleanly mapped to it (similar to how
microprocessors are "universal IR's", allowing many source languages to
be mapped to them). By providing type information, LLVM can be used as
the target of optimizations: for example, through pointer analysis, it
can be proven that a C automatic variable is never accessed outside of
the current function, allowing it to be promoted to a simple SSA value
instead of a memory location.
.. _wellformed:
Well-Formedness
---------------
It is important to note that this document describes 'well formed' LLVM
assembly language. There is a difference between what the parser accepts
and what is considered 'well formed'. For example, the following
instruction is syntactically okay, but not well formed:
.. code-block:: llvm
%x = add i32 1, %x
because the definition of ``%x`` does not dominate all of its uses. The
LLVM infrastructure provides a verification pass that may be used to
verify that an LLVM module is well formed. This pass is automatically
run by the parser after parsing input assembly and by the optimizer
before it outputs bitcode. The violations pointed out by the verifier
pass indicate bugs in transformation passes or input to the parser.
.. _identifiers:
Identifiers
===========
LLVM identifiers come in two basic types: global and local. Global
identifiers (functions, global variables) begin with the ``'@'``
character. Local identifiers (register names, types) begin with the
``'%'`` character. Additionally, there are three different formats for
identifiers, for different purposes:
#. Named values are represented as a string of characters with their
prefix. For example, ``%foo``, ``@DivisionByZero``,
``%a.really.long.identifier``. The actual regular expression used is
'``[%@][-a-zA-Z$._][-a-zA-Z$._0-9]*``'. Identifiers that require other
characters in their names can be surrounded with quotes. Special
characters may be escaped using ``"\xx"`` where ``xx`` is the ASCII
code for the character in hexadecimal. In this way, any character can
be used in a name value, even quotes themselves. The ``"\01"`` prefix
can be used on global variables to suppress mangling.
#. Unnamed values are represented as an unsigned numeric value with
their prefix. For example, ``%12``, ``@2``, ``%44``.
#. Constants, which are described in the section Constants_ below.
LLVM requires that values start with a prefix for two reasons: Compilers
don't need to worry about name clashes with reserved words, and the set
of reserved words may be expanded in the future without penalty.
Additionally, unnamed identifiers allow a compiler to quickly come up
with a temporary variable without having to avoid symbol table
conflicts.
Reserved words in LLVM are very similar to reserved words in other
languages. There are keywords for different opcodes ('``add``',
'``bitcast``', '``ret``', etc...), for primitive type names ('``void``',
'``i32``', etc...), and others. These reserved words cannot conflict
with variable names, because none of them start with a prefix character
(``'%'`` or ``'@'``).
Here is an example of LLVM code to multiply the integer variable
'``%X``' by 8:
The easy way:
.. code-block:: llvm
%result = mul i32 %X, 8
After strength reduction:
.. code-block:: llvm
%result = shl i32 %X, 3
And the hard way:
.. code-block:: llvm
%0 = add i32 %X, %X ; yields i32:%0
%1 = add i32 %0, %0 ; yields i32:%1
%result = add i32 %1, %1
This last way of multiplying ``%X`` by 8 illustrates several important
lexical features of LLVM:
#. Comments are delimited with a '``;``' and go until the end of line.
#. Unnamed temporaries are created when the result of a computation is
not assigned to a named value.
#. Unnamed temporaries are numbered sequentially (using a per-function
incrementing counter, starting with 0). Note that basic blocks and unnamed
function parameters are included in this numbering. For example, if the
entry basic block is not given a label name and all function parameters are
named, then it will get number 0.
It also shows a convention that we follow in this document. When
demonstrating instructions, we will follow an instruction with a comment
that defines the type and name of value produced.
High Level Structure
====================
Module Structure
----------------
LLVM programs are composed of ``Module``'s, each of which is a
translation unit of the input programs. Each module consists of
functions, global variables, and symbol table entries. Modules may be
combined together with the LLVM linker, which merges function (and
global variable) definitions, resolves forward declarations, and merges
symbol table entries. Here is an example of the "hello world" module:
.. code-block:: llvm
; Declare the string constant as a global constant.
@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00"
; External declaration of the puts function
declare i32 @puts(i8* nocapture) nounwind
; Definition of main function
define i32 @main() { ; i32()*
; Convert [13 x i8]* to i8*...
%cast210 = getelementptr [13 x i8], [13 x i8]* @.str, i64 0, i64 0
; Call puts function to write out the string to stdout.
call i32 @puts(i8* %cast210)
ret i32 0
}
; Named metadata
!0 = !{i32 42, null, !"string"}
!foo = !{!0}
This example is made up of a :ref:`global variable <globalvars>` named
"``.str``", an external declaration of the "``puts``" function, a
:ref:`function definition <functionstructure>` for "``main``" and
:ref:`named metadata <namedmetadatastructure>` "``foo``".
In general, a module is made up of a list of global values (where both
functions and global variables are global values). Global values are
represented by a pointer to a memory location (in this case, a pointer
to an array of char, and a pointer to a function), and have one of the
following :ref:`linkage types <linkage>`.
.. _linkage:
Linkage Types
-------------
All Global Variables and Functions have one of the following types of
linkage:
``private``
Global values with "``private``" linkage are only directly
accessible by objects in the current module. In particular, linking
code into a module with a private global value may cause the
private to be renamed as necessary to avoid collisions. Because the
symbol is private to the module, all references can be updated. This
doesn't show up in any symbol table in the object file.
``internal``
Similar to private, but the value shows as a local symbol
(``STB_LOCAL`` in the case of ELF) in the object file. This
corresponds to the notion of the '``static``' keyword in C.
``available_externally``
Globals with "``available_externally``" linkage are never emitted into
the object file corresponding to the LLVM module. From the linker's
perspective, an ``available_externally`` global is equivalent to
an external declaration. They exist to allow inlining and other
optimizations to take place given knowledge of the definition of the
global, which is known to be somewhere outside the module. Globals
with ``available_externally`` linkage are allowed to be discarded at
will, and allow inlining and other optimizations. This linkage type is
only allowed on definitions, not declarations.
``linkonce``
Globals with "``linkonce``" linkage are merged with other globals of
the same name when linkage occurs. This can be used to implement
some forms of inline functions, templates, or other code which must
be generated in each translation unit that uses it, but where the
body may be overridden with a more definitive definition later.
Unreferenced ``linkonce`` globals are allowed to be discarded. Note
that ``linkonce`` linkage does not actually allow the optimizer to
inline the body of this function into callers because it doesn't
know if this definition of the function is the definitive definition
within the program or whether it will be overridden by a stronger
definition. To enable inlining and other optimizations, use
"``linkonce_odr``" linkage.
``weak``
"``weak``" linkage has the same merging semantics as ``linkonce``
linkage, except that unreferenced globals with ``weak`` linkage may
not be discarded. This is used for globals that are declared "weak"
in C source code.
``common``
"``common``" linkage is most similar to "``weak``" linkage, but they
are used for tentative definitions in C, such as "``int X;``" at
global scope. Symbols with "``common``" linkage are merged in the
same way as ``weak symbols``, and they may not be deleted if
unreferenced. ``common`` symbols may not have an explicit section,
must have a zero initializer, and may not be marked
':ref:`constant <globalvars>`'. Functions and aliases may not have
common linkage.
.. _linkage_appending:
``appending``
"``appending``" linkage may only be applied to global variables of
pointer to array type. When two global variables with appending
linkage are linked together, the two global arrays are appended
together. This is the LLVM, typesafe, equivalent of having the
system linker append together "sections" with identical names when
.o files are linked.
Unfortunately this doesn't correspond to any feature in .o files, so it
can only be used for variables like ``llvm.global_ctors`` which llvm
interprets specially.
``extern_weak``
The semantics of this linkage follow the ELF object file model: the
symbol is weak until linked; if not linked, the symbol becomes null
instead of being an undefined reference.
``linkonce_odr``, ``weak_odr``
Some languages allow differing globals to be merged, such as two
functions with different semantics. Other languages, such as
``C++``, ensure that only equivalent globals are ever merged (the
"one definition rule" --- "ODR"). Such languages can use the
``linkonce_odr`` and ``weak_odr`` linkage types to indicate that the
global will only be merged with equivalent globals. These linkage
types are otherwise the same as their non-``odr`` versions.
``external``
If none of the above identifiers are used, the global is externally
visible, meaning that it participates in linkage and can be used to
resolve external symbol references.
It is illegal for a function *declaration* to have any linkage type
other than ``external`` or ``extern_weak``.
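As a brief, informal illustration (the global and function names below are hypothetical and not taken from any example elsewhere in this manual), several of these linkage types are written directly on global definitions:
.. code-block:: llvm
; Hypothetical globals, shown only to illustrate linkage syntax.
@counter = private global i32 0 ; visible only within this module
@state = internal global i32 1 ; local symbol (STB_LOCAL on ELF)
@template_guard = linkonce_odr global i32 2 ; merged with equivalent globals under the ODR
@X = common global i32 0, align 4 ; tentative definition, as "int X;" in C
@exported = global i32 3 ; external linkage when none is written
declare extern_weak void @maybe_present() ; becomes null if not resolved at link time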
.. _callingconv:
Calling Conventions
-------------------
LLVM :ref:`functions <functionstructure>`, :ref:`calls <i_call>` and
:ref:`invokes <i_invoke>` can all have an optional calling convention
specified for the call. The calling convention of any pair of dynamic
caller/callee must match, or the behavior of the program is undefined.
The following calling conventions are supported by LLVM, and more may be
added in the future:
"``ccc``" - The C calling convention
This calling convention (the default if no other calling convention
is specified) matches the target C calling conventions. This calling
convention supports varargs function calls and tolerates some
mismatch in the declared prototype and implemented declaration of
the function (as does normal C).
"``fastcc``" - The fast calling convention
This calling convention attempts to make calls as fast as possible
(e.g. by passing things in registers). This calling convention
allows the target to use whatever tricks it wants to produce fast
code for the target, without having to conform to an externally
specified ABI (Application Binary Interface). `Tail calls can only
be optimized when this, the GHC or the HiPE convention is
used. <CodeGenerator.html#id80>`_ This calling convention does not
support varargs and requires the prototype of all callees to exactly
match the prototype of the function definition.
"``coldcc``" - The cold calling convention
This calling convention attempts to make code in the caller as
efficient as possible under the assumption that the call is not
commonly executed. As such, these calls often preserve all registers
so that the call does not break any live ranges in the caller side.
This calling convention does not support varargs and requires the
prototype of all callees to exactly match the prototype of the
function definition. Furthermore the inliner doesn't consider such function
calls for inlining.
"``cc 10``" - GHC convention
This calling convention has been implemented specifically for use by
the `Glasgow Haskell Compiler (GHC) <http://www.haskell.org/ghc>`_.
It passes everything in registers, going to extremes to achieve this
by disabling callee save registers. This calling convention should
not be used lightly but only for specific situations such as an
alternative to the *register pinning* performance technique often
used when implementing functional programming languages. At the
moment only X86 supports this convention and it has the following
limitations:
- On *X86-32* only supports up to 4 bit type parameters. No
floating point types are supported.
- On *X86-64* only supports up to 10 bit type parameters and 6
floating point parameters.
This calling convention supports `tail call
optimization <CodeGenerator.html#id80>`_ but requires both the
caller and callee are using it.
"``cc 11``" - The HiPE calling convention
This calling convention has been implemented specifically for use by
the `High-Performance Erlang
(HiPE) <http://www.it.uu.se/research/group/hipe/>`_ compiler, *the*
native code compiler of `Ericsson's Open Source Erlang/OTP
system <http://www.erlang.org/download.shtml>`_. It uses more
registers for argument passing than the ordinary C calling
convention and defines no callee-saved registers. The calling
convention properly supports `tail call
optimization <CodeGenerator.html#id80>`_ but requires that both the
caller and the callee use it. It uses a *register pinning*
mechanism, similar to GHC's convention, for keeping frequently
accessed runtime components pinned to specific hardware registers.
At the moment only X86 supports this convention (both 32 and 64
bit).
"``webkit_jscc``" - WebKit's JavaScript calling convention
This calling convention has been implemented for `WebKit FTL JIT
<https://trac.webkit.org/wiki/FTLJIT>`_. It passes arguments on the
stack right to left (as cdecl does), and returns a value in the
platform's customary return register.
"``anyregcc``" - Dynamic calling convention for code patching
This is a special convention that supports patching an arbitrary code
sequence in place of a call site. This convention forces the call
arguments into registers but allows them to be dynamically
allocated. This can currently only be used with calls to
llvm.experimental.patchpoint because only this intrinsic records
the location of its arguments in a side table. See :doc:`StackMaps`.
"``preserve_mostcc``" - The `PreserveMost` calling convention
This calling convention attempts to make the code in the caller as
unintrusive as possible. This convention behaves identically to the `C`
calling convention on how arguments and return values are passed, but it
uses a different set of caller/callee-saved registers. This alleviates the
burden of saving and recovering a large register set before and after the
call in the caller. If the arguments are passed in callee-saved registers,
then they will be preserved by the callee across the call. This doesn't
apply for values returned in callee-saved registers.
- On X86-64 the callee preserves all general purpose registers, except for
R11. R11 can be used as a scratch register. Floating-point registers
(XMMs/YMMs) are not preserved and need to be saved by the caller.
The idea behind this convention is to support calls to runtime functions
that have a hot path and a cold path. The hot path is usually a small piece
of code that doesn't use many registers. The cold path might need to call out to
another function and therefore only needs to preserve the caller-saved
registers, which haven't already been saved by the caller. The
`PreserveMost` calling convention is very similar to the `cold` calling
convention in terms of caller/callee-saved registers, but they are used for
different types of function calls. `coldcc` is for function calls that are
rarely executed, whereas `preserve_mostcc` function calls are intended to be
on the hot path and definitely executed a lot. Furthermore `preserve_mostcc`
doesn't prevent the inliner from inlining the function call.
This calling convention will be used by a future version of the ObjectiveC
runtime and should therefore still be considered experimental at this time.
Although this convention was created to optimize certain runtime calls to
the ObjectiveC runtime, it is not limited to this runtime and might be used
by other runtimes in the future too. The current implementation only
supports X86-64, but the intention is to support more architectures in the
future.
"``preserve_allcc``" - The `PreserveAll` calling convention
This calling convention attempts to make the code in the caller even less
intrusive than the `PreserveMost` calling convention. This calling
convention also behaves identically to the `C` calling convention on how
arguments and return values are passed, but it uses a different set of
caller/callee-saved registers. This removes the burden of saving and
recovering a large register set before and after the call in the caller. If
the arguments are passed in callee-saved registers, then they will be
preserved by the callee across the call. This doesn't apply for values
returned in callee-saved registers.
- On X86-64 the callee preserves all general purpose registers, except for
R11. R11 can be used as a scratch register. Furthermore it also preserves
all floating-point registers (XMMs/YMMs).
The idea behind this convention is to support calls to runtime functions
that don't need to call out to any other functions.
This calling convention, like the `PreserveMost` calling convention, will be
used by a future version of the ObjectiveC runtime and should be considered
experimental at this time.
"``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions
Clang generates an access function to access C++-style TLS. The access
function generally has an entry block, an exit block and an initialization
block that is run the first time. The entry and exit blocks can access
a few TLS IR variables; each access will be lowered to a platform-specific
sequence.
This calling convention aims to minimize overhead in the caller by
preserving as many registers as possible (all the registers that are
preserved on the fast path, composed of the entry and exit blocks).
This calling convention behaves identically to the `C` calling convention on
how arguments and return values are passed, but it uses a different set of
caller/callee-saved registers.
Given that each platform has its own lowering sequence, hence its own set
of preserved registers, we can't use the existing `PreserveMost`.
- On X86-64 the callee preserves all general purpose registers, except for
RDI and RAX.
"``swiftcc``" - This calling convention is used for Swift language.
- On X86-64 RCX and R8 are available for additional integer returns, and
XMM2 and XMM3 are available for additional FP/vector returns.
- On iOS platforms, we use AAPCS-VFP calling convention.
"``cc <n>``" - Numbered convention
Any calling convention may be specified by number, allowing
target-specific calling conventions to be used. Target specific
calling conventions start at 64.
More calling conventions can be added/defined on an as-needed basis, to
support Pascal conventions or any other well-known target-independent
convention.
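As an illustration, the following minimal sketch (the function names are
hypothetical) declares a runtime entry point with the ``preserve_mostcc``
convention and calls it; the call site must name the same calling convention
as the callee:
.. code-block:: llvm
define preserve_mostcc void @runtime_entry() {
ret void
}
define void @caller() {
call preserve_mostcc void @runtime_entry()
ret void
}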
.. _visibilitystyles:
Visibility Styles
-----------------
All Global Variables and Functions have one of the following visibility
styles:
"``default``" - Default style
On targets that use the ELF object file format, default visibility
means that the declaration is visible to other modules and, in
shared libraries, means that the declared entity may be overridden.
On Darwin, default visibility means that the declaration is visible
to other modules. Default visibility corresponds to "external
linkage" in the language.
"``hidden``" - Hidden style
Two declarations of an object with hidden visibility refer to the
same object if they are in the same shared object. Usually, hidden
visibility indicates that the symbol will not be placed into the
dynamic symbol table, so no other module (executable or shared
library) can reference it directly.
"``protected``" - Protected style
On ELF, protected visibility indicates that the symbol will be
placed in the dynamic symbol table, but that references within the
defining module will bind to the local symbol. That is, the symbol
cannot be overridden by another module.
A symbol with ``internal`` or ``private`` linkage must have ``default``
visibility.
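For example, the following sketch (with hypothetical names) shows globals and
a function using the three visibility styles:
.. code-block:: llvm
@a = global i32 0               ; default visibility
@b = hidden global i32 1
@c = protected global i32 2
define hidden void @helper() {
ret void
}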
.. _dllstorageclass:
DLL Storage Classes
-------------------
All Global Variables, Functions and Aliases can have one of the following
DLL storage classes:
``dllimport``
"``dllimport``" causes the compiler to reference a function or variable via
a global pointer to a pointer that is set up by the DLL exporting the
symbol. On Microsoft Windows targets, the pointer name is formed by
combining ``__imp_`` and the function or variable name.
``dllexport``
"``dllexport``" causes the compiler to provide a global pointer to a pointer
in a DLL, so that it can be referenced with the ``dllimport`` attribute. On
Microsoft Windows targets, the pointer name is formed by combining
``__imp_`` and the function or variable name. Since this storage class
exists for defining a dll interface, the compiler, assembler and linker know
it is externally referenced and must refrain from deleting the symbol.
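For example (using hypothetical names), a module that imports a variable and a
function from a DLL and exports one of its own functions might contain:
.. code-block:: llvm
@imported_var = external dllimport global i32
declare dllimport void @imported_fn()
define dllexport void @exported_fn() {
ret void
}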
.. _tls_model:
Thread Local Storage Models
---------------------------
A variable may be defined as ``thread_local``, which means that it will
not be shared by threads (each thread will have a separate copy of the
variable). Not all targets support thread-local variables. Optionally, a
TLS model may be specified:
``localdynamic``
For variables that are only used within the current shared library.
``initialexec``
For variables in modules that will not be loaded dynamically.
``localexec``
For variables defined in the executable and only used within it.
If no explicit model is given, the "general dynamic" model is used.
The models correspond to the ELF TLS models; see `ELF Handling For
Thread-Local Storage <http://people.redhat.com/drepper/tls.pdf>`_ for
more information on under which circumstances the different models may
be used. The target may choose a different TLS model if the specified
model is not supported, or if a better choice of model can be made.
A model can also be specified in an alias, but then it only governs how
the alias is accessed. It will not have any effect in the aliasee.
For platforms without linker support of the ELF TLS model, the -femulated-tls
flag can be used to generate GCC-compatible emulated TLS code.
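For example, a thread-local variable with an explicit ``localdynamic`` model
and one that uses the default general dynamic model could be written as:
.. code-block:: llvm
@tls_ld = thread_local(localdynamic) global i32 0
@tls_gd = thread_local global i32 0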
.. _namedtypes:
Structure Types
---------------
LLVM IR allows you to specify both "identified" and "literal" :ref:`structure
types <t_struct>`. Literal types are uniqued structurally, but identified types
are never uniqued. An :ref:`opaque structural type <t_opaque>` can also be used
to forward declare a type that is not yet available.
An example of an identified structure specification is:
.. code-block:: llvm
%mytype = type { %mytype*, i32 }
Prior to the LLVM 3.0 release, identified types were structurally uniqued. Only
literal types are uniqued in recent versions of LLVM.
.. _nointptrtype:
Non-Integral Pointer Type
-------------------------
Note: non-integral pointer types are a work in progress, and they should be
considered experimental at this time.
LLVM IR optionally allows the frontend to denote pointers in certain address
spaces as "non-integral" via the :ref:`datalayout string<langref_datalayout>`.
Non-integral pointer types represent pointers that have an *unspecified* bitwise
representation; that is, the integral representation may be target dependent or
unstable (not backed by a fixed integer).
``inttoptr`` instructions converting integers to non-integral pointer types are
ill-typed, and so are ``ptrtoint`` instructions converting values of
non-integral pointer types to integers. Vector versions of said instructions
are ill-typed as well.
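As a hedged sketch, a frontend could mark address space 1 as non-integral via
the ``ni`` component of the :ref:`datalayout string <langref_datalayout>` and
place a global in that address space:
.. code-block:: llvm
target datalayout = "e-p:64:64:64-ni:1"
@g = addrspace(1) global i8 0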
.. _globalvars:
Global Variables
----------------
Global variables define regions of memory allocated at compilation time
instead of run-time.
Global variable definitions must be initialized.
Global variables in other translation units can also be declared, in which
case they don't have an initializer.
Either global variable definitions or declarations may have an explicit section
to be placed in and may have an optional explicit alignment specified.
A variable may be defined as a global ``constant``, which indicates that
the contents of the variable will **never** be modified (enabling better
optimization, allowing the global data to be placed in the read-only
section of an executable, etc). Note that variables that need runtime
initialization cannot be marked ``constant`` as there is a store to the
variable.
LLVM explicitly allows *declarations* of global variables to be marked
constant, even if the final definition of the global is not. This
capability can be used to enable slightly better optimization of the
program, but requires the language definition to guarantee that
optimizations based on the 'constantness' are valid for the translation
units that do not include the definition.
As SSA values, global variables define pointer values that are in scope
(i.e. they dominate) all basic blocks in the program. Global variables
always define a pointer to their "content" type because they describe a
region of memory, and all memory objects in LLVM are accessed through
pointers.
Global variables can be marked with ``unnamed_addr`` which indicates
that the address is not significant, only the content. Constants marked
like this can be merged with other constants if they have the same
initializer. Note that a constant with significant address *can* be
merged with a ``unnamed_addr`` constant, the result being a constant
whose address is significant.
If the ``local_unnamed_addr`` attribute is given, the address is known to
not be significant within the module.
A global variable may be declared to reside in a target-specific
numbered address space. For targets that support them, address spaces
may affect how optimizations are performed and/or what target
instructions are used to access the variable. The default address space
is zero. The address space qualifier must precede any other attributes.
LLVM allows an explicit section to be specified for globals. If the
target supports it, it will emit globals to the section specified.
Additionally, the global can be placed in a comdat if the target has the necessary
support.
By default, global initializers are optimized by assuming that global
variables defined within the module are not modified from their
initial values before the start of the global initializer. This is
true even for variables potentially accessible from outside the
module, including those with external linkage or appearing in
``@llvm.used`` or dllexported variables. This assumption may be suppressed
by marking the variable with ``externally_initialized``.
An explicit alignment may be specified for a global, which must be a
power of 2. If not present, or if the alignment is set to zero, the
alignment of the global is set by the target to whatever it feels
convenient. If an explicit alignment is specified, the global is forced
to have exactly that alignment. Targets and optimizers are not allowed
to over-align the global if the global has an assigned section. In this
case, the extra alignment could be observable: for example, code could
assume that the globals are densely packed in their section and try to
iterate over them as an array; alignment padding would break this
iteration. The maximum alignment is ``1 << 29``.
Globals can also have a :ref:`DLL storage class <dllstorageclass>`,
an optional :ref:`global attributes <glattrs>` and
an optional list of attached :ref:`metadata <metadata>`.
Variables and aliases can have a
:ref:`Thread Local Storage Model <tls_model>`.
Syntax::
@<GlobalVarName> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
[(unnamed_addr|local_unnamed_addr)] [AddrSpace]
[ExternallyInitialized]
<global | constant> <Type> [<InitializerConstant>]
[, section "name"] [, comdat [($name)]]
[, align <Alignment>] (, !name !N)*
For example, the following defines a global in a numbered address space
with an initializer, section, and alignment:
.. code-block:: llvm
@G = addrspace(5) constant float 1.0, section "foo", align 4
The following example just declares a global variable:
.. code-block:: llvm
@G = external global i32
The following example defines a thread-local global with the
``initialexec`` TLS model:
.. code-block:: llvm
@G = thread_local(initialexec) global i32 0, align 4
.. _functionstructure:
Functions
---------
LLVM function definitions consist of the "``define``" keyword, an
optional :ref:`linkage type <linkage>`, an optional :ref:`visibility
style <visibility>`, an optional :ref:`DLL storage class <dllstorageclass>`,
an optional :ref:`calling convention <callingconv>`,
an optional ``unnamed_addr`` attribute, a return type, an optional
:ref:`parameter attribute <paramattrs>` for the return type, a function
name, a (possibly empty) argument list (each with optional :ref:`parameter
attributes <paramattrs>`), optional :ref:`function attributes <fnattrs>`,
an optional section, an optional alignment,
an optional :ref:`comdat <langref_comdats>`,
an optional :ref:`garbage collector name <gc>`, an optional :ref:`prefix <prefixdata>`,
an optional :ref:`prologue <prologuedata>`,
an optional :ref:`personality <personalityfn>`,
an optional list of attached :ref:`metadata <metadata>`,
an opening curly brace, a list of basic blocks, and a closing curly brace.
LLVM function declarations consist of the "``declare``" keyword, an
optional :ref:`linkage type <linkage>`, an optional :ref:`visibility style
<visibility>`, an optional :ref:`DLL storage class <dllstorageclass>`, an
optional :ref:`calling convention <callingconv>`, an optional ``unnamed_addr``
or ``local_unnamed_addr`` attribute, a return type, an optional :ref:`parameter
attribute <paramattrs>` for the return type, a function name, a possibly
empty list of arguments, an optional alignment, an optional :ref:`garbage
collector name <gc>`, an optional :ref:`prefix <prefixdata>`, and an optional
:ref:`prologue <prologuedata>`.
A function definition contains a list of basic blocks, forming the CFG (Control
Flow Graph) for the function. Each basic block may optionally start with a label
(giving the basic block a symbol table entry), contains a list of instructions,
and ends with a :ref:`terminator <terminators>` instruction (such as a branch or
function return). If an explicit label is not provided, a block is assigned an
implicit numbered label, using the next value from the same counter as used for
unnamed temporaries (:ref:`see above<identifiers>`). For example, if a function
entry block does not have an explicit label, it will be assigned label "%0",
then the first unnamed temporary in that block will be "%1", etc.
The first basic block in a function is special in two ways: it is
immediately executed on entrance to the function, and it is not allowed
to have predecessor basic blocks (i.e. there can not be any branches to
the entry block of a function). Because the block can have no
predecessors, it also cannot have any :ref:`PHI nodes <i_phi>`.
LLVM allows an explicit section to be specified for functions. If the
target supports it, it will emit functions to the section specified.
Additionally, the function can be placed in a COMDAT.
An explicit alignment may be specified for a function. If not present,
or if the alignment is set to zero, the alignment of the function is set
by the target to whatever it feels convenient. If an explicit alignment
is specified, the function is forced to have at least that much
alignment. All alignments must be a power of 2.
If the ``unnamed_addr`` attribute is given, the address is known to not
be significant and two identical functions can be merged.
If the ``local_unnamed_addr`` attribute is given, the address is known to
not be significant within the module.
Syntax::
define [linkage] [visibility] [DLLStorageClass]
[cconv] [ret attrs]
<ResultType> @<FunctionName> ([argument list])
[(unnamed_addr|local_unnamed_addr)] [fn Attrs] [section "name"]
[comdat [($name)]] [align N] [gc] [prefix Constant]
[prologue Constant] [personality Constant] (!name !N)* { ... }
The argument list is a comma separated sequence of arguments where each
argument is of the following form:
Syntax::
<type> [parameter Attrs] [name]
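For example, the following sketch (with hypothetical names) shows a definition
and a declaration that use several of the optional components described above:
.. code-block:: llvm
declare i32 @external_helper(i8* nocapture) nounwind
define internal i32 @add(i32 %a, i32 %b) nounwind align 16 {
entry:
%sum = add i32 %a, %b
ret i32 %sum
}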
.. _langref_aliases:
Aliases
-------
Aliases, unlike functions or variables, don't create any new data. They
are just a new symbol and metadata for an existing position.
Aliases have a name and an aliasee that is either a global value or a
constant expression.
Aliases may have an optional :ref:`linkage type <linkage>`, an optional
:ref:`visibility style <visibility>`, an optional :ref:`DLL storage class
<dllstorageclass>` and an optional :ref:`tls model <tls_model>`.
Syntax::
@<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] [(unnamed_addr|local_unnamed_addr)] alias <AliaseeTy>, <AliaseeTy>* @<Aliasee>
The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``,
``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
might not correctly handle dropping a weak symbol that is aliased.
Aliases that are not ``unnamed_addr`` are guaranteed to have the same address as
the aliasee expression. ``unnamed_addr`` ones are only guaranteed to point
to the same content.
If the ``local_unnamed_addr`` attribute is given, the address is known to
not be significant within the module.
Since aliases are only a second name, some restrictions apply, of which
some can only be checked when producing an object file:
* The expression defining the aliasee must be computable at assembly
time. Since it is just a name, no relocations can be used.
* No alias in the expression can be weak as the possibility of the
intermediate alias being overridden cannot be represented in an
object file.
* No global value in the expression can be a declaration, since that
would require a relocation, which is not possible.
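For example, the following sketch (names are hypothetical) defines an alias to
a global variable and a ``weak_odr`` alias to a function:
.. code-block:: llvm
@counter = global i32 0
@counter_alias = hidden alias i32, i32* @counter
define void @impl() {
ret void
}
@entry_point = weak_odr alias void (), void ()* @impl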
.. _langref_ifunc:
IFuncs
-------
IFuncs, like aliases, don't create any new data or functions. They are just a
new symbol that the dynamic linker resolves at runtime by calling a resolver
function. IFuncs have a name and a resolver, a function called by the dynamic
linker that returns the address of another function associated with the name.
IFunc may have an optional :ref:`linkage type <linkage>` and an optional
:ref:`visibility style <visibility>`.
Syntax::
@<Name> = [Linkage] [Visibility] ifunc <IFuncTy>, <ResolverTy>* @<Resolver>
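For example, the following sketch (with hypothetical names) defines an IFunc
whose resolver returns a pointer to the implementation to use at runtime:
.. code-block:: llvm
define i32 @impl_a(i32 %x) {
ret i32 %x
}
define i32 (i32)* @resolver() {
ret i32 (i32)* @impl_a
}
@dispatch = ifunc i32 (i32), i32 (i32)* ()* @resolver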
.. _langref_comdats:
Comdats
-------
Comdat IR provides access to COFF and ELF object file COMDAT functionality.
Comdats have a name which represents the COMDAT key. All global objects that
specify this key will only end up in the final object file if the linker chooses
that key over some other key. Aliases are placed in the same COMDAT that their
aliasee computes to, if any.
Comdats have a selection kind to provide input on how the linker should
choose between keys in two different object files.
Syntax::
$<Name> = comdat SelectionKind
The selection kind must be one of the following:
``any``
The linker may choose any COMDAT key, the choice is arbitrary.
``exactmatch``
The linker may choose any COMDAT key but the sections must contain the
same data.
``largest``
The linker will choose the section containing the largest COMDAT key.
``noduplicates``
The linker requires that only one section with this COMDAT key exists.
``samesize``
The linker may choose any COMDAT key but the sections must contain the
same amount of data.
Note that the Mach-O platform doesn't support COMDATs and ELF only supports
``any`` as a selection kind.
Here is an example of a COMDAT group where a function will only be selected if
the COMDAT key's section is the largest:
.. code-block:: text
$foo = comdat largest
@foo = global i32 2, comdat($foo)
define void @bar() comdat($foo) {
ret void
}
As syntactic sugar, the ``$name`` can be omitted if the name is the same as
the global name:
.. code-block:: text
$foo = comdat any
@foo = global i32 2, comdat
In a COFF object file, this will create a COMDAT section with selection kind
``IMAGE_COMDAT_SELECT_LARGEST`` containing the contents of the ``@foo`` symbol
and another COMDAT section with selection kind
``IMAGE_COMDAT_SELECT_ASSOCIATIVE`` which is associated with the first COMDAT
section and contains the contents of the ``@bar`` symbol.
There are some restrictions on the properties of the global object.
It, or an alias to it, must have the same name as the COMDAT group when
targeting COFF.
The contents and size of this object may be used during link-time to determine
which COMDAT groups get selected depending on the selection kind.
Because the name of the object must match the name of the COMDAT group, the
linkage of the global object must not be local; local symbols can get renamed
if a collision occurs in the symbol table.
The combined use of COMDATs and section attributes may yield surprising results.
For example:
.. code-block:: text
$foo = comdat any
$bar = comdat any
@g1 = global i32 42, section "sec", comdat($foo)
@g2 = global i32 42, section "sec", comdat($bar)
From the object file perspective, this requires the creation of two sections
with the same name. This is necessary because both globals belong to different
COMDAT groups and COMDATs, at the object file level, are represented by
sections.
Note that certain IR constructs like global variables and functions may
create COMDATs in the object file in addition to any which are specified using
COMDAT IR. This arises when the code generator is configured to emit globals
in individual sections (e.g. when `-data-sections` or `-function-sections`
is supplied to `llc`).
.. _namedmetadatastructure:
Named Metadata
--------------
Named metadata is a collection of metadata. :ref:`Metadata
nodes <metadata>` (but not metadata strings) are the only valid
operands for a named metadata.
#. Named metadata are represented as a string of characters with the
metadata prefix. The rules for metadata names are the same as for
identifiers, but quoted names are not allowed. ``"\xx"`` type escapes
are still valid, which allows any character to be part of a name.
Syntax::
; Some unnamed metadata nodes, which are referenced by the named metadata.
!0 = !{!"zero"}
!1 = !{!"one"}
!2 = !{!"two"}
; A named metadata.
!name = !{!0, !1, !2}
.. _paramattrs:
Parameter Attributes
--------------------
The return type and each parameter of a function type may have a set of
*parameter attributes* associated with them. Parameter attributes are
used to communicate additional information about the result or
parameters of a function. Parameter attributes are considered to be part
of the function, not of the function type, so functions with different
parameter attributes can have the same function type.
Parameter attributes are simple keywords that follow the type specified.
If multiple parameter attributes are needed, they are space separated.
For example:
.. code-block:: llvm
declare i32 @printf(i8* noalias nocapture, ...)
declare i32 @atoi(i8 zeroext)
declare signext i8 @returns_signed_char()
Note that any attributes for the function result (``nounwind``,
``readonly``) come immediately after the argument list.
Currently, only the following parameter attributes are defined:
``zeroext``
This indicates to the code generator that the parameter or return
value should be zero-extended to the extent required by the target's
ABI by the caller (for a parameter) or the callee (for a return value).
``signext``
This indicates to the code generator that the parameter or return
value should be sign-extended to the extent required by the target's
ABI (which is usually 32-bits) by the caller (for a parameter) or
the callee (for a return value).
``inreg``
This indicates that this parameter or return value should be treated
in a special target-dependent fashion while emitting code for
a function call or return (usually, by putting it in a register as
opposed to memory, though some targets use it to distinguish between
two different kinds of registers). Use of this attribute is
target-specific.
``byval``
This indicates that the pointer parameter should really be passed by
value to the function. The attribute implies that a hidden copy of
the pointee is made between the caller and the callee, so the callee
is unable to modify the value in the caller. This attribute is only
valid on LLVM pointer arguments. It is generally used to pass
structs and arrays by value, but is also valid on pointers to
scalars. The copy is considered to belong to the caller not the
callee (for example, ``readonly`` functions should not write to
``byval`` parameters). This is not a valid attribute for return
values.
The byval attribute also supports specifying an alignment with the
align attribute. It indicates the alignment of the stack slot to
form and the known alignment of the pointer specified to the call
site. If the alignment is not specified, then the code generator
makes a target-specific assumption.
.. _attr_inalloca:
``inalloca``
The ``inalloca`` argument attribute allows the caller to take the
address of outgoing stack arguments. An ``inalloca`` argument must
be a pointer to stack memory produced by an ``alloca`` instruction.
The alloca, or argument allocation, must also be tagged with the
inalloca keyword. Only the last argument may have the ``inalloca``
attribute, and that argument is guaranteed to be passed in memory.
An argument allocation may be used by a call at most once because
the call may deallocate it. The ``inalloca`` attribute cannot be
used in conjunction with other attributes that affect argument
storage, like ``inreg``, ``nest``, ``sret``, or ``byval``. The
``inalloca`` attribute also disables LLVM's implicit lowering of
large aggregate return values, which means that frontend authors
must lower them with ``sret`` pointers.
When the call site is reached, the argument allocation must have
been the most recent stack allocation that is still live, or the
results are undefined. It is possible to allocate additional stack
space after an argument allocation and before its call site, but it
must be cleared off with :ref:`llvm.stackrestore
<int_stackrestore>`.
See :doc:`InAlloca` for more information on how to use this
attribute.
``sret``
This indicates that the pointer parameter specifies the address of a
structure that is the return value of the function in the source
program. This pointer must be guaranteed by the caller to be valid:
loads and stores to the structure may be assumed by the callee not
to trap and to be properly aligned. This is not a valid attribute
for return values.
``align <n>``
This indicates that the pointer value may be assumed by the optimizer to
have the specified alignment.
Note that this attribute has additional semantics when combined with the
``byval`` attribute.
.. _noalias:
``noalias``
This indicates that objects accessed via pointer values
:ref:`based <pointeraliasing>` on the argument or return value are not also
accessed, during the execution of the function, via pointer values not
*based* on the argument or return value. The attribute on a return value
also has additional semantics described below. The caller shares the
responsibility with the callee for ensuring that these requirements are met.
For further details, please see the discussion of the NoAlias response in
:ref:`alias analysis <Must, May, or No>`.
Note that this definition of ``noalias`` is intentionally similar
to the definition of ``restrict`` in C99 for function arguments.
For function return values, C99's ``restrict`` is not meaningful,
while LLVM's ``noalias`` is. Furthermore, the semantics of the ``noalias``
attribute on return values are stronger than the semantics of the attribute
when used on function arguments. On function return values, the ``noalias``
attribute indicates that the function acts like a system memory allocation
function, returning a pointer to allocated storage disjoint from the
storage for any other object accessible to the caller.
``nocapture``
This indicates that the callee does not make any copies of the
pointer that outlive the callee itself. This is not a valid
attribute for return values. Addresses used in volatile operations
are considered to be captured.
.. _nest:
``nest``
This indicates that the pointer parameter can be excised using the
:ref:`trampoline intrinsics <int_trampoline>`. This is not a valid
attribute for return values and can only be applied to one parameter.
``returned``
This indicates that the function always returns the argument as its return
value. This is a hint to the optimizer and code generator used when
generating the caller, allowing value propagation, tail call optimization,
and omission of register saves and restores in some cases; it is not
checked or enforced when generating the callee. The parameter and the
function return type must be valid operands for the
:ref:`bitcast instruction <i_bitcast>`. This is not a valid attribute for
return values and can only be applied to one parameter.
``nonnull``
This indicates that the parameter or return pointer is not null. This
attribute may only be applied to pointer typed parameters. This is not
checked or enforced by LLVM, the caller must ensure that the pointer
passed in is non-null, or the callee must ensure that the returned pointer
is non-null.
``dereferenceable(<n>)``
This indicates that the parameter or return pointer is dereferenceable. This
attribute may only be applied to pointer typed parameters. A pointer that
is dereferenceable can be loaded from speculatively without a risk of
trapping. The number of bytes known to be dereferenceable must be provided
in parentheses. It is legal for the number of bytes to be less than the
size of the pointee type. The ``nonnull`` attribute does not imply
dereferenceability (consider a pointer to one element past the end of an
array), however ``dereferenceable(<n>)`` does imply ``nonnull`` in
``addrspace(0)`` (which is the default address space).
``dereferenceable_or_null(<n>)``
This indicates that the parameter or return value isn't both
non-null and non-dereferenceable (up to ``<n>`` bytes) at the same
time. All non-null pointers tagged with
``dereferenceable_or_null(<n>)`` are ``dereferenceable(<n>)``.
For address space 0 ``dereferenceable_or_null(<n>)`` implies that
a pointer is exactly one of ``dereferenceable(<n>)`` or ``null``,
and in other address spaces ``dereferenceable_or_null(<n>)``
implies that a pointer is at least one of ``dereferenceable(<n>)``
or ``null`` (i.e. it may be both ``null`` and
``dereferenceable(<n>)``). This attribute may only be applied to
pointer typed parameters.
``swiftself``
This indicates that the parameter is the self/context parameter. This is not
a valid attribute for return values and can only be applied to one
parameter.
``swifterror``
This attribute is motivated to model and optimize Swift error handling. It
can be applied to a parameter with pointer to pointer type or a
pointer-sized alloca. At the call site, the actual argument that corresponds
to a ``swifterror`` parameter has to come from a ``swifterror`` alloca or
the ``swifterror`` parameter of the caller. A ``swifterror`` value (either
the parameter or the alloca) can only be loaded and stored from, or used as
a ``swifterror`` argument. This is not a valid attribute for return values
and can only be applied to one parameter.
These constraints allow the calling convention to optimize access to
``swifterror`` variables by associating them with a specific register at
call boundaries rather than placing them in memory. Since this does change
the calling convention, a function which uses the ``swifterror`` attribute
on a parameter is not ABI-compatible with one which does not.
These constraints also allow LLVM to assume that a ``swifterror`` argument
does not alias any other memory visible within a function and that a
``swifterror`` alloca passed as an argument does not escape.
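As a further illustration of the pointer attributes above, the following
hypothetical declarations combine ``byval``, ``sret``, ``nonnull``,
``dereferenceable`` and ``zeroext``:
.. code-block:: llvm
%struct.Pair = type { i32, i32 }
declare void @consume(%struct.Pair* byval align 8)
declare void @produce(%struct.Pair* noalias sret)
declare i8* @find(i8* nonnull dereferenceable(16), i8 zeroext)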
.. _gc:
Garbage Collector Strategy Names
--------------------------------
Each function may specify a garbage collector strategy name, which is simply a
string:
.. code-block:: llvm
define void @f() gc "name" { ... }
The supported values of *name* include those :ref:`built in to LLVM
<builtin-gc-strategies>` and any provided by loaded plugins. Specifying a GC
strategy will cause the compiler to alter its output in order to support the
named garbage collection algorithm. Note that LLVM itself does not contain a
garbage collector, this functionality is restricted to generating machine code
which can interoperate with a collector provided externally.
.. _prefixdata:
Prefix Data
-----------
Prefix data is data associated with a function which the code
generator will emit immediately before the function's entrypoint.
The purpose of this feature is to allow frontends to associate
language-specific runtime metadata with specific functions and make it
available through the function pointer while still allowing the
function pointer to be called.
To access the data for a given function, a program may bitcast the
function pointer to a pointer to the constant's type and dereference
index -1. This implies that the IR symbol points just past the end of
the prefix data. For instance, take the example of a function annotated
with a single ``i32``,
.. code-block:: llvm
define void @f() prefix i32 123 { ... }
The prefix data can be referenced as,
.. code-block:: llvm
%0 = bitcast void ()* @f to i32*
%a = getelementptr inbounds i32, i32* %0, i32 -1
%b = load i32, i32* %a
Prefix data is laid out as if it were an initializer for a global variable
of the prefix data's type. The function will be placed such that the
beginning of the prefix data is aligned. This means that if the size
of the prefix data is not a multiple of the alignment size, the
function's entrypoint will not be aligned. If alignment of the
function's entrypoint is desired, padding must be added to the prefix
data.
A function may have prefix data but no body. This has similar semantics
to the ``available_externally`` linkage in that the data may be used by the
optimizers but will not be emitted in the object file.
.. _prologuedata:
Prologue Data
-------------
The ``prologue`` attribute allows arbitrary code (encoded as bytes) to
be inserted prior to the function body. This can be used for enabling
function hot-patching and instrumentation.
To maintain the semantics of ordinary function calls, the prologue data must
have a particular format. Specifically, it must begin with a sequence of
bytes which decode to a sequence of machine instructions, valid for the
module's target, which transfer control to the point immediately succeeding
the prologue data, without performing any other visible action. This allows
the inliner and other passes to reason about the semantics of the function
definition without needing to reason about the prologue data. Obviously this
makes the format of the prologue data highly target dependent.
A trivial example of valid prologue data for the x86 architecture is ``i8 144``,
which encodes the ``nop`` instruction:
.. code-block:: text
define void @f() prologue i8 144 { ... }
Generally prologue data can be formed by encoding a relative branch instruction
which skips the metadata, as in this example of valid prologue data for the
x86_64 architecture, where the first two bytes encode ``jmp .+10``:
.. code-block:: text
%0 = type <{ i8, i8, i8* }>
define void @f() prologue %0 <{ i8 235, i8 8, i8* @md}> { ... }
A function may have prologue data but no body. This has similar semantics
to the ``available_externally`` linkage in that the data may be used by the
optimizers but will not be emitted in the object file.
.. _personalityfn:
Personality Function
--------------------
The ``personality`` attribute permits functions to specify what function
to use for exception handling.
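For example, a function using the Itanium C++ personality routine
``__gxx_personality_v0`` would typically be written as:
.. code-block:: llvm
declare i32 @__gxx_personality_v0(...)
define void @f() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
ret void
}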
.. _attrgrp:
Attribute Groups
----------------
Attribute groups are groups of attributes that are referenced by objects within
the IR. They are important for keeping ``.ll`` files readable, because a lot of
functions will use the same set of attributes. In the degenerate case of a
``.ll`` file that corresponds to a single ``.c`` file, the single attribute
group will capture the important command line flags used to build that file.
An attribute group is a module-level object. To use an attribute group, an
object references the attribute group's ID (e.g. ``#37``). An object may refer
to more than one attribute group. In that situation, the attributes from the
different groups are merged.
Here is an example of attribute groups for a function that should always be
inlined, has a stack alignment of 4, and which shouldn't use SSE instructions:
.. code-block:: llvm
; Target-independent attributes:
attributes #0 = { alwaysinline alignstack=4 }
; Target-dependent attributes:
attributes #1 = { "no-sse" }
; Function @f has attributes: alwaysinline, alignstack=4, and "no-sse".
define void @f() #0 #1 { ... }
.. _fnattrs:
Function Attributes
-------------------
Function attributes are set to communicate additional information about
a function. Function attributes are considered to be part of the
function, not of the function type, so functions with different function
attributes can have the same function type.
Function attributes are simple keywords that follow the type specified.
If multiple attributes are needed, they are space separated. For
example:
.. code-block:: llvm
define void @f() noinline { ... }
define void @f() alwaysinline { ... }
define void @f() alwaysinline optsize { ... }
define void @f() optsize { ... }
``alignstack(<n>)``
This attribute indicates that, when emitting the prologue and
epilogue, the backend should forcibly align the stack pointer.
Specify the desired alignment, which must be a power of two, in
parentheses.
``allocsize(<EltSizeParam>[, <NumEltsParam>])``
This attribute indicates that the annotated function will always return at
least a given number of bytes (or null). Its arguments are zero-indexed
parameter numbers; if one argument is provided, then it's assumed that at
least ``CallSite.Args[EltSizeParam]`` bytes will be available at the
returned pointer. If two are provided, then it's assumed that
``CallSite.Args[EltSizeParam] * CallSite.Args[NumEltsParam]`` bytes are
available. The referenced parameters must be integer types. No assumptions
are made about the contents of the returned block of memory.
``alwaysinline``
This attribute indicates that the inliner should attempt to inline
this function into callers whenever possible, ignoring any active
inlining size threshold for this caller.
``builtin``
This indicates that the callee function at a call site should be
recognized as a built-in function, even though the function's declaration
uses the ``nobuiltin`` attribute. This is only valid at call sites for
direct calls to functions that are declared with the ``nobuiltin``
attribute.
``cold``
This attribute indicates that this function is rarely called. When
computing edge weights, basic blocks post-dominated by a cold
function call are also considered to be cold; and, thus, given low
weight.
``convergent``
In some parallel execution models, there exist operations that cannot be
made control-dependent on any additional values. We call such operations
``convergent``, and mark them with this attribute.
The ``convergent`` attribute may appear on functions or call/invoke
instructions. When it appears on a function, it indicates that calls to
this function should not be made control-dependent on additional values.
For example, the intrinsic ``llvm.nvvm.barrier0`` is ``convergent``, so
calls to this intrinsic cannot be made control-dependent on additional
values.
When it appears on a call/invoke, the ``convergent`` attribute indicates
that we should treat the call as though we're calling a convergent
function. This is particularly useful on indirect calls; without this we
may treat such calls as though the target is non-convergent.
The optimizer may remove the ``convergent`` attribute on functions when it
can prove that the function does not execute any convergent operations.
Similarly, the optimizer may remove ``convergent`` on calls/invokes when it
can prove that the call/invoke cannot call a convergent function.
``inaccessiblememonly``
This attribute indicates that the function may only access memory that
is not accessible by the module being compiled. This is a weaker form
of ``readnone``.
``inaccessiblemem_or_argmemonly``
This attribute indicates that the function may only access memory that is
either not accessible by the module being compiled, or is pointed to
by its pointer arguments. This is a weaker form of ``argmemonly``.
``inlinehint``
This attribute indicates that the source code contained a hint that
inlining this function is desirable (such as the "inline" keyword in
C/C++). It is just a hint; it imposes no requirements on the
inliner.
``jumptable``
This attribute indicates that the function should be added to a
jump-instruction table at code-generation time, and that all address-taken
references to this function should be replaced with a reference to the
appropriate jump-instruction-table function pointer. Note that this creates
a new pointer for the original function, which means that code that depends
on function-pointer identity can break. So, any function annotated with
``jumptable`` must also be ``unnamed_addr``.
``minsize``
This attribute suggests that optimization passes and code generator
passes make choices that keep the code size of this function as small
as possible and perform optimizations that may sacrifice runtime
performance in order to minimize the size of the generated code.
``naked``
This attribute disables prologue / epilogue emission for the
function. This can have very system-specific consequences.
``nobuiltin``
This indicates that the callee function at a call site is not recognized as
a built-in function. LLVM will retain the original call and not replace it
with equivalent code based on the semantics of the built-in function, unless
the call site uses the ``builtin`` attribute. This is valid at call sites
and on function declarations and definitions.
``noduplicate``
This attribute indicates that calls to the function cannot be
duplicated. A call to a ``noduplicate`` function may be moved
within its parent function, but may not be duplicated within
its parent function.
A function containing a ``noduplicate`` call may still
be an inlining candidate, provided that the call is not
duplicated by inlining. That implies that the function has
internal linkage and only has one call site, so the original
call is dead after inlining.
``noimplicitfloat``
This attribute disables implicit floating-point instructions.
``noinline``
This attribute indicates that the inliner should never inline this
function in any situation. This attribute may not be used together
with the ``alwaysinline`` attribute.
``nonlazybind``
This attribute suppresses lazy symbol binding for the function. This
may make calls to the function faster, at the cost of extra program
startup time if the function is not called during program startup.
``noredzone``
This attribute indicates that the code generator should not use a
red zone, even if the target-specific ABI normally permits it.
``noreturn``
This function attribute indicates that the function never returns
normally. This produces undefined behavior at runtime if the
function ever does dynamically return.
``norecurse``
This function attribute indicates that the function does not call itself
either directly or indirectly down any possible call path. This produces
undefined behavior at runtime if the function ever does recurse.
``nounwind``
This function attribute indicates that the function never raises an
exception. If the function does raise an exception, its runtime
behavior is undefined. However, functions marked nounwind may still
trap or generate asynchronous exceptions. Exception handling schemes
that are recognized by LLVM to handle asynchronous exceptions, such
as SEH, will still provide their implementation defined semantics.
``optnone``
This function attribute indicates that most optimization passes will skip
this function, with the exception of interprocedural optimization passes.
Code generation defaults to the "fast" instruction selector.
This attribute cannot be used together with the ``alwaysinline``
attribute; this attribute is also incompatible
with the ``minsize`` attribute and the ``optsize`` attribute.
This attribute requires the ``noinline`` attribute to be specified on
the function as well, so the function is never inlined into any caller.
Only functions with the ``alwaysinline`` attribute are valid
candidates for inlining into the body of this function.
``optsize``
This attribute suggests that optimization passes and code generator
passes make choices that keep the code size of this function low,
and otherwise do optimizations specifically to reduce code size as
long as they do not significantly impact runtime performance.
``"patchable-function"``
This attribute tells the code generator that the code
generated for this function needs to follow certain conventions that
make it possible for a runtime function to patch over it later.
The exact effect of this attribute depends on its string value,
for which there currently is one legal possibility:
* ``"prologue-short-redirect"`` - This style of patchable
function is intended to support patching a function prologue to
redirect control away from the function in a thread safe
manner. It guarantees that the first instruction of the
function will be large enough to accommodate a short jump
instruction, and will be sufficiently aligned to allow being
fully changed via an atomic compare-and-swap instruction.
While the first requirement can be satisfied by inserting a large
enough NOP, LLVM can and will try to re-purpose an existing
instruction (i.e. one that would have to be emitted anyway) as
a patchable instruction larger than a short jump.
``"prologue-short-redirect"`` is currently only supported on
x86-64.
This attribute by itself does not imply restrictions on
inter-procedural optimizations. All of the semantic effects the
patching may have must be separately conveyed via the linkage type.
``"probe-stack"``
This attribute indicates that the function will trigger a guard region
at the end of the stack. It ensures that accesses to the stack are
no further apart than the size of the guard region from a previous
access of the stack. It takes one required string value, the name of
the stack probing function that will be called.
If a function that has a ``"probe-stack"`` attribute is inlined into
a function with another ``"probe-stack"`` attribute, the resulting
function has the ``"probe-stack"`` attribute of the caller. If a
function that has a ``"probe-stack"`` attribute is inlined into a
function that has no ``"probe-stack"`` attribute at all, the resulting
function has the ``"probe-stack"`` attribute of the callee.
``readnone``
On a function, this attribute indicates that the function computes its
result (or decides to unwind an exception) based strictly on its arguments,
without dereferencing any pointer arguments or otherwise accessing
any mutable state (e.g. memory, control registers, etc) visible to
caller functions. It does not write through any pointer arguments
(including ``byval`` arguments) and never changes any state visible
to callers. This means while it cannot unwind exceptions by calling
the ``C++`` exception throwing methods (since they write to memory), there may
be non-``C++`` mechanisms that throw exceptions without writing to LLVM
visible memory.
On an argument, this attribute indicates that the function does not
dereference that pointer argument, even though it may read or write the
memory that the pointer points to if accessed through other pointers.
``readonly``
On a function, this attribute indicates that the function does not write
through any pointer arguments (including ``byval`` arguments) or otherwise
modify any state (e.g. memory, control registers, etc) visible to
caller functions. It may dereference pointer arguments and read
state that may be set in the caller. A readonly function always
returns the same value (or unwinds an exception identically) when
called with the same set of arguments and global state. This means while it
cannot unwind exceptions by calling the ``C++`` exception throwing methods
(since they write to memory), there may be non-``C++`` mechanisms that throw
exceptions without writing to LLVM visible memory.
On an argument, this attribute indicates that the function does not write
through this pointer argument, even though it may write to the memory that
the pointer points to.
``"stack-probe-size"``
This attribute controls the behavior of stack probes: either
the ``"probe-stack"`` attribute, or ABI-required stack probes, if any.
It defines the size of the guard region. It ensures that if the function
may use more stack space than the size of the guard region, a stack probing
sequence will be emitted. It takes one required integer value, which
is 4096 by default.
If a function that has a ``"stack-probe-size"`` attribute is inlined into
a function with another ``"stack-probe-size"`` attribute, the resulting
function has the ``"stack-probe-size"`` attribute that has the lower
numeric value. If a function that has a ``"stack-probe-size"`` attribute is
inlined into a function that has no ``"stack-probe-size"`` attribute
at all, the resulting function has the ``"stack-probe-size"`` attribute
of the callee.
``writeonly``
On a function, this attribute indicates that the function may write to but
does not read from memory.
On an argument, this attribute indicates that the function may write to but
does not read through this pointer argument (even though it may read from
the memory that the pointer points to).
``argmemonly``
This attribute indicates that the only memory accesses inside function are
loads and stores from objects pointed to by its pointer-typed arguments,
with arbitrary offsets. Or in other words, all memory operations in the
function can refer to memory only using pointers based on its function
arguments.
Note that ``argmemonly`` can be used together with the ``readonly`` attribute
in order to specify that the function reads only from its arguments.
``returns_twice``
This attribute indicates that this function can return twice. The C
``setjmp`` is an example of such a function. The compiler disables
some optimizations (like tail calls) in the caller of these
functions.
``safestack``
This attribute indicates that
`SafeStack <http://clang.llvm.org/docs/SafeStack.html>`_
protection is enabled for this function.
If a function that has a ``safestack`` attribute is inlined into a
function that doesn't have a ``safestack`` attribute or which has an
``ssp``, ``sspstrong`` or ``sspreq`` attribute, then the resulting
function will have a ``safestack`` attribute.
``sanitize_address``
This attribute indicates that AddressSanitizer checks
(dynamic address safety analysis) are enabled for this function.
``sanitize_memory``
This attribute indicates that MemorySanitizer checks (dynamic detection
of accesses to uninitialized memory) are enabled for this function.
``sanitize_thread``
This attribute indicates that ThreadSanitizer checks
(dynamic thread safety analysis) are enabled for this function.
``speculatable``
This function attribute indicates that the function does not have any
effects besides calculating its result and does not have undefined behavior.
Note that ``speculatable`` is not enough to conclude that along any
particular execution path the number of calls to this function will not be
externally observable. This attribute is only valid on functions
and declarations, not on individual call sites. If a function is
incorrectly marked as speculatable and really does exhibit
undefined behavior, the undefined behavior may be observed even
if the call site is dead code.
``ssp``
This attribute indicates that the function should emit a stack
smashing protector. It is in the form of a "canary" --- a random value
placed on the stack before the local variables that's checked upon
return from the function to see if it has been overwritten. A
heuristic is used to determine if a function needs stack protectors
or not. The heuristic used will enable protectors for functions with:
- Character arrays larger than ``ssp-buffer-size`` (default 8).
- Aggregates containing character arrays larger than ``ssp-buffer-size``.
- Calls to alloca() with variable sizes or constant sizes greater than
``ssp-buffer-size``.
Variables that are identified as requiring a protector will be arranged
on the stack such that they are adjacent to the stack protector guard.
If a function that has an ``ssp`` attribute is inlined into a
function that doesn't have an ``ssp`` attribute, then the resulting
function will have an ``ssp`` attribute.
``sspreq``
This attribute indicates that the function should *always* emit a
stack smashing protector. This overrides the ``ssp`` function
attribute.
Variables that are identified as requiring a protector will be arranged
on the stack such that they are adjacent to the stack protector guard.
The specific layout rules are:
#. Large arrays and structures containing large arrays
(``>= ssp-buffer-size``) are closest to the stack protector.
#. Small arrays and structures containing small arrays
(``< ssp-buffer-size``) are 2nd closest to the protector.
#. Variables that have had their address taken are 3rd closest to the
protector.
If a function that has an ``sspreq`` attribute is inlined into a
function that doesn't have an ``sspreq`` attribute or which has an
``ssp`` or ``sspstrong`` attribute, then the resulting function will have
an ``sspreq`` attribute.
``sspstrong``
This attribute indicates that the function should emit a stack smashing
protector. This attribute causes a strong heuristic to be used when
determining if a function needs stack protectors. The strong heuristic
will enable protectors for functions with:
- Arrays of any size and type
- Aggregates containing an array of any size and type.
- Calls to alloca().
- Local variables that have had their address taken.
Variables that are identified as requiring a protector will be arranged
on the stack such that they are adjacent to the stack protector guard.
The specific layout rules are:
#. Large arrays and structures containing large arrays
(``>= ssp-buffer-size``) are closest to the stack protector.
#. Small arrays and structures containing small arrays
(``< ssp-buffer-size``) are 2nd closest to the protector.
#. Variables that have had their address taken are 3rd closest to the
protector.
This overrides the ``ssp`` function attribute.
If a function that has an ``sspstrong`` attribute is inlined into a
function that doesn't have an ``sspstrong`` attribute, then the
resulting function will have an ``sspstrong`` attribute.
``"thunk"``
This attribute indicates that the function will delegate to some other
function with a tail call. The prototype of a thunk should not be used for
optimization purposes. The caller is expected to cast the thunk prototype to
match the thunk target prototype.
``uwtable``
This attribute indicates that the ABI being targeted requires that
an unwind table entry be produced for this function even if we can
show that no exceptions pass by it. This is normally the case for
the ELF x86-64 ABI, but it can be disabled for some compilation
units.
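To illustrate a few of the attributes above, the following sketch (all names
hypothetical) marks two allocator declarations with ``allocsize`` and a leaf
function with several keyword and string attributes:
.. code-block:: llvm
declare noalias i8* @my_malloc(i64) allocsize(0)
declare noalias i8* @my_calloc(i64, i64) allocsize(0, 1)
define void @leaf() noinline norecurse nounwind "probe-stack"="__my_probe" {
ret void
}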
.. _glattrs:
Global Attributes
-----------------
Attributes may be set to communicate additional information about a global variable.
Unlike :ref:`function attributes <fnattrs>`, attributes on a global variable
are grouped into a single :ref:`attribute group <attrgrp>`.
.. _opbundles:
Operand Bundles
---------------
Operand bundles are tagged sets of SSA values that can be associated
with certain LLVM instructions (currently only ``call`` s and
``invoke`` s). In a way they are like metadata, but dropping them is
incorrect and will change program semantics.
Syntax::
operand bundle set ::= '[' operand bundle (, operand bundle )* ']'
operand bundle ::= tag '(' [ bundle operand ] (, bundle operand )* ')'
bundle operand ::= SSA value
tag ::= string constant
Operand bundles are **not** part of a function's signature, and a
given function may be called from multiple places with different kinds
of operand bundles. This reflects the fact that the operand bundles
are conceptually a part of the ``call`` (or ``invoke``), not the
callee being dispatched to.
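For example, the same (hypothetical) callee may be invoked with different
bundle sets, including bundles with tags unknown to LLVM:
.. code-block:: llvm
declare void @callee()
define void @f(i8* %ptr) {
call void @callee() [ "tag0"(i32 0) ]
call void @callee() [ "tag0"(i32 0, i64 100), "tag1"(i8* %ptr) ]
call void @callee()
ret void
}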
Operand bundles are a generic mechanism intended to support
runtime-introspection-like functionality for managed languages. While
the exact semantics of an operand bundle depend on the bundle tag,
there are certain limitations to how much the presence of an operand
bundle can influence the semantics of a program. These restrictions
are described as the semantics of an "unknown" operand bundle. As
long as the behavior of an operand bundle is describable within these
restrictions, LLVM does not need to have special knowledge of the
operand bundle to not miscompile programs containing it.
- The bundle operands for an unknown operand bundle escape in unknown
ways before control is transferred to the callee or invokee.
- Calls and invokes with operand bundles have unknown read / write
effect on the heap on entry and exit (even if the call target is
``readnone`` or ``readonly``), unless they're overridden with
callsite specific attributes.
- An operand bundle at a call site cannot change the implementation
of the called function. Inter-procedural optimizations work as
usual as long as they take into account the first two properties.
More specific types of operand bundles are described below.
.. _deopt_opbundles:
Deoptimization Operand Bundles
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Deoptimization operand bundles are characterized by the ``"deopt"``
operand bundle tag. These operand bundles represent an alternate
"safe" continuation for the call site they're attached to, and can be
used by a suitable runtime to deoptimize the compiled frame at the
specified call site. There can be at most one ``"deopt"`` operand
bundle attached to a call site. Exact details of deoptimization are
out of scope for the language reference, but they usually involve
rewriting a compiled frame into a set of interpreted frames.
From the compiler's perspective, deoptimization operand bundles make
the call sites they're attached to at least ``readonly``. They read
through all of their pointer typed operands (even if they're not
otherwise escaped) and the entire visible heap. Deoptimization
operand bundles do not capture their operands except during
deoptimization, in which case control will not be returned to the
compiled frame.
The inliner knows how to inline through calls that have deoptimization
operand bundles. Just like inlining through a normal call site
involves composing the normal and exceptional continuations, inlining
through a call site with a deoptimization operand bundle needs to
appropriately compose the "safe" deoptimization continuation. The
inliner does this by prepending the parent's deoptimization
continuation to every deoptimization continuation in the inlined body.
E.g. inlining ``@f`` into ``@g`` in the following example
.. code-block:: llvm
define void @f() {
call void @x() ;; no deopt state
call void @y() [ "deopt"(i32 10) ]
call void @y() [ "deopt"(i32 10), "unknown"(i8* null) ]
ret void
}
define void @g() {
call void @f() [ "deopt"(i32 20) ]
ret void
}
will result in
.. code-block:: llvm
define void @g() {
call void @x() ;; still no deopt state
call void @y() [ "deopt"(i32 20, i32 10) ]
call void @y() [ "deopt"(i32 20, i32 10), "unknown"(i8* null) ]
ret void
}
It is the frontend's responsibility to structure or encode the
deoptimization state in a way that syntactically prepending the
caller's deoptimization state to the callee's deoptimization state is
semantically equivalent to composing the caller's deoptimization
continuation after the callee's deoptimization continuation.
.. _ob_funclet:
Funclet Operand Bundles
^^^^^^^^^^^^^^^^^^^^^^^
Funclet operand bundles are characterized by the ``"funclet"``
operand bundle tag. These operand bundles indicate that a call site
is within a particular funclet. There can be at most one
``"funclet"`` operand bundle attached to a call site and it must have
exactly one bundle operand.
If any funclet EH pads have been "entered" but not "exited" (per the
`description in the EH doc\ <ExceptionHandling.html#wineh-constraints>`_),
it is undefined behavior to execute a ``call`` or ``invoke`` which:
* does not have a ``"funclet"`` bundle and is not a ``call`` to a nounwind
intrinsic, or
* has a ``"funclet"`` bundle whose operand is not the most-recently-entered
not-yet-exited funclet EH pad.
Similarly, if no funclet EH pads have been entered-but-not-yet-exited,
executing a ``call`` or ``invoke`` with a ``"funclet"`` bundle is undefined behavior.
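As a minimal sketch of well-formed usage (the callees and the MSVC personality
routine are illustrative), a call made from inside a ``catchpad`` funclet names
that pad's token in its ``"funclet"`` bundle:
.. code-block:: llvm
    define void @use_funclet() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
    entry:
      invoke void @may_throw()
              to label %exit unwind label %catch.dispatch
    catch.dispatch:
      %cs = catchswitch within none [label %catch] unwind to caller
    catch:
      %cp = catchpad within %cs [i8* null, i32 64, i8* null]
      ; This call executes inside the funclet entered by %cp, so it carries
      ; a "funclet" bundle naming that pad's token.
      call void @do_cleanup() [ "funclet"(token %cp) ]
      catchret from %cp to label %exit
    exit:
      ret void
    }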
GC Transition Operand Bundles
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
GC transition operand bundles are characterized by the
``"gc-transition"`` operand bundle tag. These operand bundles mark a
call as a transition between a function with one GC strategy to a
function with a different GC strategy. If coordinating the transition
between GC strategies requires additional code generation at the call
site, these bundles may contain any values that are needed by the
generated code. For more details, see :ref:`GC Transitions
<gc_transition_args>`.
.. _moduleasm:
Module-Level Inline Assembly
----------------------------
Modules may contain "module-level inline asm" blocks, which correspond
to the GCC "file scope inline asm" blocks. These blocks are internally
concatenated by LLVM and treated as a single unit, but may be separated
in the ``.ll`` file if desired. The syntax is very simple:
.. code-block:: llvm
module asm "inline asm code goes here"
module asm "more can go here"
The strings can contain any character by escaping non-printable
characters. The escape sequence used is simply "\\xx" where "xx" is the
two digit hex code for the number.
Note that the assembly string *must* be parseable by LLVM's integrated assembler
(unless it is disabled), even when emitting a ``.s`` file.
.. _langref_datalayout:
Data Layout
-----------
A module may specify a target specific data layout string that specifies
how data is to be laid out in memory. The syntax for the data layout is
simply:
.. code-block:: llvm
target datalayout = "layout specification"
The *layout specification* consists of a list of specifications
separated by the minus sign character ('-'). Each specification starts
with a letter and may include other information after the letter to
define some aspect of the data layout. The specifications accepted are
as follows:
``E``
Specifies that the target lays out data in big-endian form. That is,
the bits with the most significance have the lowest address
location.
``e``
Specifies that the target lays out data in little-endian form. That
is, the bits with the least significance have the lowest address
location.
``S<size>``
Specifies the natural alignment of the stack in bits. Alignment
promotion of stack variables is limited to the natural stack
alignment to avoid dynamic stack realignment. The stack alignment
must be a multiple of 8-bits. If omitted, the natural stack
alignment defaults to "unspecified", which does not prevent any
alignment promotions.
``A<address space>``
Specifies the address space of objects created by '``alloca``'.
Defaults to the default address space of 0.
``p[n]:<size>:<abi>:<pref>``
This specifies the *size* of a pointer and its ``<abi>`` and
``<pref>``\erred alignments for address space ``n``. All sizes are in
bits. The address space, ``n``, is optional, and if not specified,
denotes the default address space 0. The value of ``n`` must be
in the range [1,2^23).
``i<size>:<abi>:<pref>``
This specifies the alignment for an integer type of a given bit
``<size>``. The value of ``<size>`` must be in the range [1,2^23).
``v<size>:<abi>:<pref>``
This specifies the alignment for a vector type of a given bit
``<size>``.
``f<size>:<abi>:<pref>``
This specifies the alignment for a floating point type of a given bit
``<size>``. Only values of ``<size>`` that are supported by the target
will work. 32 (float) and 64 (double) are supported on all targets; 80
or 128 (different flavors of long double) are also supported on some
targets.
``a:<abi>:<pref>``
This specifies the alignment for an object of aggregate type.
``m:<mangling>``
If present, specifies that llvm names are mangled in the output. The
options are
* ``e``: ELF mangling: Private symbols get a ``.L`` prefix.
* ``m``: Mips mangling: Private symbols get a ``$`` prefix.
* ``o``: Mach-O mangling: Private symbols get an ``L`` prefix. Other
symbols get a ``_`` prefix.
* ``w``: Windows COFF prefix: Similar to Mach-O, but stdcall and fastcall
functions also get a suffix based on the frame size.
* ``x``: Windows x86 COFF prefix: Similar to Windows COFF, but use a ``_``
prefix for ``__cdecl`` functions.
``n<size1>:<size2>:<size3>...``
This specifies a set of native integer widths for the target CPU in
bits. For example, it might contain ``n32`` for 32-bit PowerPC,
``n32:64`` for PowerPC 64, or ``n8:16:32:64`` for X86-64. Elements of
this set are considered to support most general arithmetic operations
efficiently.
``ni:<address space0>:<address space1>:<address space2>...``
This specifies pointer types with the specified address spaces
as :ref:`Non-Integral Pointer Type <nointptrtype>` s. The ``0``
address space cannot be specified as non-integral.
On every specification that takes a ``<abi>:<pref>``, specifying the
``<pref>`` alignment is optional. If omitted, the preceding ``:``
should be omitted too and ``<pref>`` will be equal to ``<abi>``.
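As an illustration (not a normative description of any particular target), a
layout string resembling common 64-bit little-endian configurations combines
several of the specifications above: little-endian data, ELF-style mangling,
64-bit-aligned ``i64``, 128-bit-aligned ``f80``, native integer widths of
8/16/32/64 bits, and a 128-bit natural stack alignment:
.. code-block:: llvm
    target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"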
When constructing the data layout for a given target, LLVM starts with a
default set of specifications which are then (possibly) overridden by
the specifications in the ``datalayout`` keyword. The default
specifications are given in this list:
- ``E`` - big endian
- ``p:64:64:64`` - 64-bit pointers with 64-bit alignment.
- ``p[n]:64:64:64`` - Other address spaces are assumed to be the
same as the default address space.
- ``S0`` - natural stack alignment is unspecified
- ``i1:8:8`` - i1 is 8-bit (byte) aligned
- ``i8:8:8`` - i8 is 8-bit (byte) aligned
- ``i16:16:16`` - i16 is 16-bit aligned
- ``i32:32:32`` - i32 is 32-bit aligned
- ``i64:32:64`` - i64 has ABI alignment of 32-bits but preferred
alignment of 64-bits
- ``f16:16:16`` - half is 16-bit aligned
- ``f32:32:32`` - float is 32-bit aligned
- ``f64:64:64`` - double is 64-bit aligned
- ``f128:128:128`` - quad is 128-bit aligned
- ``v64:64:64`` - 64-bit vector is 64-bit aligned
- ``v128:128:128`` - 128-bit vector is 128-bit aligned
- ``a:0:64`` - aggregates are 64-bit aligned
When LLVM is determining the alignment for a given type, it uses the
following rules:
#. If the type sought is an exact match for one of the specifications,
that specification is used.
#. If no match is found, and the type sought is an integer type, then
the smallest integer type that is larger than the bitwidth of the
sought type is used. If none of the specifications are larger than
the bitwidth then the largest integer type is used. For example,
given the default specifications above, the i7 type will use the
alignment of i8 (next largest) while both i65 and i256 will use the
alignment of i64 (largest specified).
#. If no match is found, and the type sought is a vector type, then the
largest vector type that is smaller than the sought vector type will
be used as a fall back. This happens because <128 x double> can be
implemented in terms of 64 <2 x double>, for example.
The function of the data layout string may not be what you expect.
Notably, this is not a specification from the frontend of what alignment
the code generator should use.
Instead, if specified, the target data layout is required to match what
the ultimate *code generator* expects. This string is used by the
mid-level optimizers to improve code, and this only works if it matches
what the ultimate code generator uses. There is no way to generate IR
that does not embed this target-specific detail into the IR. If you
don't specify the string, the default specifications will be used to
generate a Data Layout and the optimization phases will operate
accordingly and introduce target specificity into the IR with respect to
these default specifications.
.. _langref_triple:
Target Triple
-------------
A module may specify a target triple string that describes the target
host. The syntax for the target triple is simply:
.. code-block:: llvm
target triple = "x86_64-apple-macosx10.7.0"
The *target triple* string consists of a series of identifiers delimited
by the minus sign character ('-'). The canonical forms are:
::
ARCHITECTURE-VENDOR-OPERATING_SYSTEM
ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT
This information is passed along to the backend so that it generates
code for the proper architecture. It's possible to override this on the
command line with the ``-mtriple`` command line option.
.. _pointeraliasing:
Pointer Aliasing Rules
----------------------
Any memory access must be done through a pointer value associated with
an address range of the memory access, otherwise the behavior is
undefined. Pointer values are associated with address ranges according
to the following rules:
- A pointer value is associated with the addresses associated with any
value it is *based* on.
- An address of a global variable is associated with the address range
of the variable's storage.
- The result value of an allocation instruction is associated with the
address range of the allocated storage.
- A null pointer in the default address-space is associated with no
address.
- An integer constant other than zero or a pointer value returned from
a function not defined within LLVM may be associated with address
ranges allocated through mechanisms other than those provided by
LLVM. Such ranges shall not overlap with any ranges of addresses
allocated by mechanisms provided by LLVM.
A pointer value is *based* on another pointer value according to the
following rules:
- A pointer value formed from a ``getelementptr`` operation is *based*
on the second value operand of the ``getelementptr``.
- The result value of a ``bitcast`` is *based* on the operand of the
``bitcast``.
- A pointer value formed by an ``inttoptr`` is *based* on all pointer
values that contribute (directly or indirectly) to the computation of
the pointer's value.
- The "*based* on" relationship is transitive.
Note that this definition of *"based"* is intentionally similar to the
definition of *"based"* in C99, though it is slightly weaker.
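For instance, in the sketch below (``%base`` is a hypothetical pointer already
in scope), each derived value is *based* on ``%base`` through the rules above,
so the final load must access memory associated with ``%base``:
.. code-block:: llvm
    %p = getelementptr i32, i32* %base, i64 1   ; %p is based on %base
    %q = bitcast i32* %p to i8*                 ; %q is based on %p, and transitively on %base
    %v = load i8, i8* %q                        ; must access an address associated with %base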
LLVM IR does not associate types with memory. The result type of a
``load`` merely indicates the size and alignment of the memory from
which to load, as well as the interpretation of the value. The first
operand type of a ``store`` similarly only indicates the size and
alignment of the store.
Consequently, type-based alias analysis, aka TBAA, aka
``-fstrict-aliasing``, is not applicable to general unadorned LLVM IR.
:ref:`Metadata <metadata>` may be used to encode additional information
which specialized optimization passes may use to implement type-based
alias analysis.
.. _volatile:
Volatile Memory Accesses
------------------------
Certain memory accesses, such as :ref:`load <i_load>`'s,
:ref:`store <i_store>`'s, and :ref:`llvm.memcpy <int_memcpy>`'s may be
marked ``volatile``. The optimizers must not change the number of
volatile operations or change their order of execution relative to other
volatile operations. The optimizers *may* change the order of volatile
operations relative to non-volatile operations. This is not Java's
"volatile" and has no cross-thread synchronization behavior.
IR-level volatile loads and stores cannot safely be optimized into
llvm.memcpy or llvm.memmove intrinsics even when those intrinsics are
flagged volatile. Likewise, the backend should never split or merge
target-legal volatile load/store instructions.
.. admonition:: Rationale
Platforms may rely on volatile loads and stores of natively supported
data width to be executed as a single instruction. For example, in C
this holds for an l-value of volatile primitive type with native
hardware support, but not necessarily for aggregate types. The
frontend upholds these expectations, which are intentionally
unspecified in the IR. The rules above ensure that IR transformations
do not violate the frontend's contract with the language.
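As a minimal sketch (the ``@mmio`` global is a placeholder for a memory-mapped
device register), volatile accesses are written by adding the ``volatile``
keyword to the instruction; the two accesses below may not be removed or
reordered with respect to each other:
.. code-block:: llvm
    @mmio = external global i32
    define i32 @read_status() {
      %v = load volatile i32, i32* @mmio, align 4
      store volatile i32 0, i32* @mmio, align 4
      ret i32 %v
    }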
.. _memmodel:
Memory Model for Concurrent Operations
--------------------------------------
The LLVM IR does not define any way to start parallel threads of
execution or to register signal handlers. Nonetheless, there are
platform-specific ways to create them, and we define LLVM IR's behavior
in their presence. This model is inspired by the C++0x memory model.
For a more informal introduction to this model, see the :doc:`Atomics`.
We define a *happens-before* partial order as the least partial order
that
- Is a superset of single-thread program order, and
- When a *synchronizes-with* ``b``, includes an edge from ``a`` to
``b``. *Synchronizes-with* pairs are introduced by platform-specific
techniques, like pthread locks, thread creation, thread joining,
etc., and by atomic instructions. (See also :ref:`Atomic Memory Ordering
Constraints <ordering>`).
Note that program order does not introduce *happens-before* edges
between a thread and signals executing inside that thread.
Every (defined) read operation (load instructions, memcpy, atomic
loads/read-modify-writes, etc.) R reads a series of bytes written by
(defined) write operations (store instructions, atomic
stores/read-modify-writes, memcpy, etc.). For the purposes of this
section, initialized globals are considered to have a write of the
initializer which is atomic and happens before any other read or write
of the memory in question. For each byte of a read R, R\ :sub:`byte`
may see any write to the same byte, except:
- If write\ :sub:`1` happens before write\ :sub:`2`, and
write\ :sub:`2` happens before R\ :sub:`byte`, then
R\ :sub:`byte` does not see write\ :sub:`1`.
- If R\ :sub:`byte` happens before write\ :sub:`3`, then
R\ :sub:`byte` does not see write\ :sub:`3`.
Given that definition, R\ :sub:`byte` is defined as follows:
- If R is volatile, the result is target-dependent. (Volatile is
supposed to give guarantees which can support ``sig_atomic_t`` in
C/C++, and may be used for accesses to addresses that do not behave
like normal memory. It does not generally provide cross-thread
synchronization.)
- Otherwise, if there is no write to the same byte that happens before
R\ :sub:`byte`, R\ :sub:`byte` returns ``undef`` for that byte.
- Otherwise, if R\ :sub:`byte` may see exactly one write,
R\ :sub:`byte` returns the value written by that write.
- Otherwise, if R is atomic, and all the writes R\ :sub:`byte` may
see are atomic, it chooses one of the values written. See the :ref:`Atomic
Memory Ordering Constraints <ordering>` section for additional
constraints on how the choice is made.
- Otherwise R\ :sub:`byte` returns ``undef``.
R returns the value composed of the series of bytes it read. This
implies that some bytes within the value may be ``undef`` **without**
the entire value being ``undef``. Note that this only defines the
semantics of the operation; it doesn't mean that targets will emit more
than one instruction to read the series of bytes.
Note that in cases where none of the atomic intrinsics are used, this
model places only one restriction on IR transformations on top of what
is required for single-threaded execution: introducing a store to a byte
which might not otherwise be stored is not allowed in general.
(Specifically, in the case where another thread might write to and read
from an address, introducing a store can change a load that may see
exactly one write into a load that may see multiple writes.)
.. _ordering:
Atomic Memory Ordering Constraints
----------------------------------
Atomic instructions (:ref:`cmpxchg <i_cmpxchg>`,
:ref:`atomicrmw <i_atomicrmw>`, :ref:`fence <i_fence>`,
:ref:`atomic load <i_load>`, and :ref:`atomic store <i_store>`) take
ordering parameters that determine which other atomic instructions on
the same address they *synchronize with*. These semantics are borrowed
from Java and C++0x, but are somewhat more colloquial. If these
descriptions aren't precise enough, check those specs (see spec
references in the :doc:`atomics guide <Atomics>`).
:ref:`fence <i_fence>` instructions treat these orderings somewhat
differently since they don't take an address. See that instruction's
documentation for details.
For a simpler introduction to the ordering constraints, see the
:doc:`Atomics`.
``unordered``
The set of values that can be read is governed by the happens-before
partial order. A value cannot be read unless some operation wrote
it. This is intended to provide a guarantee strong enough to model
Java's non-volatile shared variables. This ordering cannot be
specified for read-modify-write operations; it is not strong enough
to make them atomic in any interesting way.
``monotonic``
In addition to the guarantees of ``unordered``, there is a single
total order for modifications by ``monotonic`` operations on each
address. All modification orders must be compatible with the
happens-before order. There is no guarantee that the modification
orders can be combined to a global total order for the whole program
(and this often will not be possible). The read in an atomic
read-modify-write operation (:ref:`cmpxchg <i_cmpxchg>` and
:ref:`atomicrmw <i_atomicrmw>`) reads the value in the modification
order immediately before the value it writes. If one atomic read
happens before another atomic read of the same address, the later
read must see the same value or a later value in the address's
modification order. This disallows reordering of ``monotonic`` (or
stronger) operations on the same address. If an address is written
``monotonic``-ally by one thread, and other threads ``monotonic``-ally
read that address repeatedly, the other threads must eventually see
the write. This corresponds to the C++0x/C1x
``memory_order_relaxed``.
``acquire``
In addition to the guarantees of ``monotonic``, a
*synchronizes-with* edge may be formed with a ``release`` operation.
This is intended to model C++'s ``memory_order_acquire``.
``release``
In addition to the guarantees of ``monotonic``, if this operation
writes a value which is subsequently read by an ``acquire``
operation, it *synchronizes-with* that operation. (This isn't a
complete description; see the C++0x definition of a release
sequence.) This corresponds to the C++0x/C1x
``memory_order_release``.
``acq_rel`` (acquire+release)
Acts as both an ``acquire`` and ``release`` operation on its
address. This corresponds to the C++0x/C1x ``memory_order_acq_rel``.
``seq_cst`` (sequentially consistent)
In addition to the guarantees of ``acq_rel`` (``acquire`` for an
operation that only reads, ``release`` for an operation that only
writes), there is a global total order on all
sequentially-consistent operations on all addresses, which is
consistent with the *happens-before* partial order and with the
modification orders of all the affected addresses. Each
sequentially-consistent read sees the last preceding write to the
same address in this global order. This corresponds to the C++0x/C1x
``memory_order_seq_cst`` and Java volatile.
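A common release/acquire pairing, sketched over a hypothetical ``@flag``
global, looks like the following; an ``acquire`` load that observes the value
written by the ``release`` store *synchronizes-with* that store:
.. code-block:: llvm
    @flag = global i32 0
    define void @producer() {
      ; publish data written earlier, then release the flag
      store atomic i32 1, i32* @flag release, align 4
      ret void
    }
    define i32 @consumer() {
      ; an acquire load that reads 1 synchronizes-with the release store
      %f = load atomic i32, i32* @flag acquire, align 4
      ret i32 %f
    }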
.. _syncscope:
If an atomic operation is marked ``syncscope("singlethread")``, it only
*synchronizes with* and only participates in the seq\_cst total orderings of
other operations running in the same thread (for example, in signal handlers).
If an atomic operation is marked ``syncscope("<target-scope>")``, where
``<target-scope>`` is a target specific synchronization scope, then it is target
dependent if it *synchronizes with* and participates in the seq\_cst total
orderings of other operations.
Otherwise, an atomic operation that is not marked ``syncscope("singlethread")``
or ``syncscope("<target-scope>")`` *synchronizes with* and participates in the
seq\_cst total orderings of other operations that are not marked
``syncscope("singlethread")`` or ``syncscope("<target-scope>")``.
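For example (``%p`` is a pointer already in scope, and the ``"workgroup"``
scope name is an assumed target-specific scope rather than one defined by LLVM
itself), a synchronization scope is written immediately before the ordering:
.. code-block:: llvm
    fence syncscope("singlethread") seq_cst
    store atomic i32 1, i32* %p syncscope("workgroup") release, align 4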
.. _fastmath:
Fast-Math Flags
---------------
LLVM IR floating-point binary ops (:ref:`fadd <i_fadd>`,
:ref:`fsub <i_fsub>`, :ref:`fmul <i_fmul>`, :ref:`fdiv <i_fdiv>`,
:ref:`frem <i_frem>`, :ref:`fcmp <i_fcmp>`) and :ref:`call <i_call>`
instructions have the following flags that can be set to enable
otherwise unsafe floating point transformations.
``nnan``
No NaNs - Allow optimizations to assume the arguments and result are not
NaN. Such optimizations are required to retain defined behavior over
NaNs, but the value of the result is undefined.
``ninf``
No Infs - Allow optimizations to assume the arguments and result are not
+/-Inf. Such optimizations are required to retain defined behavior over
+/-Inf, but the value of the result is undefined.
``nsz``
No Signed Zeros - Allow optimizations to treat the sign of a zero
argument or result as insignificant.
``arcp``
Allow Reciprocal - Allow optimizations to use the reciprocal of an
argument rather than perform division.
``contract``
Allow floating-point contraction (e.g. fusing a multiply followed by an
addition into a fused multiply-and-add).
``fast``
Fast - Allow algebraically equivalent transformations that may
dramatically change results in floating point (e.g. reassociate). This
flag implies all the others.
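For example (operand names are placeholders), the flags are written
immediately after the opcode:
.. code-block:: llvm
    %sum  = fadd fast float %a, %b        ; 'fast' implies all other flags
    %prod = fmul nnan ninf float %c, %d   ; assume no NaNs and no infinities
    %quot = fdiv arcp float %e, %f        ; a reciprocal may be used instead of dividing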
.. _uselistorder:
Use-list Order Directives
-------------------------
Use-list directives encode the in-memory order of each use-list, allowing the
order to be recreated. ``<order-indexes>`` is a comma-separated list of
indexes that are assigned to the referenced value's uses. The referenced
value's use-list is immediately sorted by these indexes.
Use-list directives may appear at function scope or global scope. They are not
instructions, and have no effect on the semantics of the IR. When they're at
function scope, they must appear after the terminator of the final basic block.
If basic blocks have their address taken via ``blockaddress()`` expressions,
``uselistorder_bb`` can be used to reorder their use-lists from outside their
function's scope.
:Syntax:
::
uselistorder <ty> <value>, { <order-indexes> }
uselistorder_bb @function, %block { <order-indexes> }
:Examples:
::
define void @foo(i32 %arg1, i32 %arg2) {
entry:
; ... instructions ...
bb:
; ... instructions ...
; At function scope.
uselistorder i32 %arg1, { 1, 0, 2 }
uselistorder label %bb, { 1, 0 }
}
; At global scope.
uselistorder i32* @global, { 1, 2, 0 }
uselistorder i32 7, { 1, 0 }
uselistorder i32 (i32) @bar, { 1, 0 }
uselistorder_bb @foo, %bb, { 5, 1, 3, 2, 0, 4 }
.. _source_filename:
Source Filename
---------------
The *source filename* string is set to the original module identifier,
which will be the name of the compiled source file when compiling from
source through the clang front end, for example. It is then preserved through
the IR and bitcode.
This is currently necessary to generate a consistent unique global
identifier for local functions used in profile data, which prepends the
source file name to the local function name.
The syntax for the source file name is simply:
.. code-block:: text
source_filename = "/path/to/source.c"
.. _typesystem:
Type System
===========
The LLVM type system is one of the most important features of the
intermediate representation. Being typed enables a number of
optimizations to be performed on the intermediate representation
directly, without having to do extra analyses on the side before the
transformation. A strong type system makes it easier to read the
generated code and enables novel analyses and transformations that are
not feasible to perform on normal three address code representations.
.. _t_void:
Void Type
---------
:Overview:
The void type does not represent any value and has no size.
:Syntax:
::
void
.. _t_function:
Function Type
-------------
:Overview:
The function type can be thought of as a function signature. It consists of a
return type and a list of formal parameter types. The return type of a function
type is a void type or first class type --- except for :ref:`label <t_label>`
and :ref:`metadata <t_metadata>` types.
:Syntax:
::
<returntype> (<parameter list>)
...where '``<parameter list>``' is a comma-separated list of type
specifiers. Optionally, the parameter list may include a type ``...``, which
indicates that the function takes a variable number of arguments. Variable
argument functions can access their arguments with the :ref:`variable argument
handling intrinsic <int_varargs>` functions. '``<returntype>``' is any type
except :ref:`label <t_label>` and :ref:`metadata <t_metadata>`.
:Examples:
+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``i32 (i32)`` | function taking an ``i32``, returning an ``i32`` |
+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``float (i16, i32 *) *`` | :ref:`Pointer <t_pointer>` to a function that takes an ``i16`` and a :ref:`pointer <t_pointer>` to ``i32``, returning ``float``. |
+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``i32 (i8*, ...)`` | A vararg function that takes at least one :ref:`pointer <t_pointer>` to ``i8`` (char in C), which returns an integer. This is the signature for ``printf`` in LLVM. |
+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``{i32, i32} (i32)`` | A function taking an ``i32``, returning a :ref:`structure <t_struct>` containing two ``i32`` values |
+---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
.. _t_firstclass:
First Class Types
-----------------
The :ref:`first class <t_firstclass>` types are perhaps the most important.
Values of these types are the only ones which can be produced by
instructions.
.. _t_single_value:
Single Value Types
^^^^^^^^^^^^^^^^^^
These are the types that are valid in registers from CodeGen's perspective.
.. _t_integer:
Integer Type
""""""""""""
:Overview:
The integer type is a very simple type that simply specifies an
arbitrary bit width for the integer type desired. Any bit width from 1
bit to 2\ :sup:`23`\ -1 (about 8 million) can be specified.
:Syntax:
::
iN
The number of bits the integer will occupy is specified by the ``N``
value.
Examples:
*********
+----------------+------------------------------------------------+
| ``i1`` | a single-bit integer. |
+----------------+------------------------------------------------+
| ``i32`` | a 32-bit integer. |
+----------------+------------------------------------------------+
| ``i1942652`` | a really big integer of over 1 million bits. |
+----------------+------------------------------------------------+
.. _t_floating:
Floating Point Types
""""""""""""""""""""
.. list-table::
:header-rows: 1
* - Type
- Description
* - ``half``
- 16-bit floating point value
* - ``float``
- 32-bit floating point value
* - ``double``
- 64-bit floating point value
* - ``fp128``
- 128-bit floating point value (112-bit mantissa)
* - ``x86_fp80``
- 80-bit floating point value (X87)
* - ``ppc_fp128``
- 128-bit floating point value (two 64-bits)
X86_mmx Type
""""""""""""
:Overview:
The x86_mmx type represents a value held in an MMX register on an x86
machine. The operations allowed on it are quite limited: parameters and
return values, load and store, and bitcast. User-specified MMX
instructions are represented as intrinsic or asm calls with arguments
and/or results of this type. There are no arrays, vectors or constants
of this type.
:Syntax:
::
x86_mmx
.. _t_pointer:
Pointer Type
""""""""""""
:Overview:
The pointer type is used to specify memory locations. Pointers are
commonly used to reference objects in memory.
Pointer types may have an optional address space attribute defining the
numbered address space where the pointed-to object resides. The default
address space is number zero. The semantics of non-zero address spaces
are target-specific.
Note that LLVM does not permit pointers to void (``void*``) nor does it
permit pointers to labels (``label*``). Use ``i8*`` instead.
:Syntax:
::
<type> *
:Examples:
+-------------------------+--------------------------------------------------------------------------------------------------------------+
| ``[4 x i32]*`` | A :ref:`pointer <t_pointer>` to :ref:`array <t_array>` of four ``i32`` values. |
+-------------------------+--------------------------------------------------------------------------------------------------------------+
| ``i32 (i32*) *`` | A :ref:`pointer <t_pointer>` to a :ref:`function <t_function>` that takes an ``i32*``, returning an ``i32``. |
+-------------------------+--------------------------------------------------------------------------------------------------------------+
| ``i32 addrspace(5)*`` | A :ref:`pointer <t_pointer>` to an ``i32`` value that resides in address space #5. |
+-------------------------+--------------------------------------------------------------------------------------------------------------+
.. _t_vector:
Vector Type
"""""""""""
:Overview:
A vector type is a simple derived type that represents a vector of
elements. Vector types are used when multiple primitive values are
operated on in parallel using a single instruction (SIMD). A vector type
requires a size (number of elements) and an underlying primitive data
type. Vector types are considered :ref:`first class <t_firstclass>`.
:Syntax:
::
< <# elements> x <elementtype> >
The number of elements is a constant integer value larger than 0;
elementtype may be any integer, floating point or pointer type. Vectors
of size zero are not allowed.
:Examples:
+-------------------+--------------------------------------------------+
| ``<4 x i32>`` | Vector of 4 32-bit integer values. |
+-------------------+--------------------------------------------------+
| ``<8 x float>`` | Vector of 8 32-bit floating-point values. |
+-------------------+--------------------------------------------------+
| ``<2 x i64>`` | Vector of 2 64-bit integer values. |
+-------------------+--------------------------------------------------+
| ``<4 x i64*>`` | Vector of 4 pointers to 64-bit integer values. |
+-------------------+--------------------------------------------------+
.. _t_label:
Label Type
^^^^^^^^^^
:Overview:
The label type represents code labels.
:Syntax:
::
label
.. _t_token:
Token Type
^^^^^^^^^^
:Overview:
The token type is used when a value is associated with an instruction
but all uses of the value must not attempt to introspect or obscure it.
As such, it is not appropriate to have a :ref:`phi <i_phi>` or
:ref:`select <i_select>` of type token.
:Syntax:
::
token
.. _t_metadata:
Metadata Type
^^^^^^^^^^^^^
:Overview:
The metadata type represents embedded metadata. No derived types may be
created from metadata except for :ref:`function <t_function>` arguments.
:Syntax:
::
metadata
.. _t_aggregate:
Aggregate Types
^^^^^^^^^^^^^^^
Aggregate Types are a subset of derived types that can contain multiple
member types. :ref:`Arrays <t_array>` and :ref:`structs <t_struct>` are
aggregate types. :ref:`Vectors <t_vector>` are not considered to be
aggregate types.
.. _t_array:
Array Type
""""""""""
:Overview:
The array type is a very simple derived type that arranges elements
sequentially in memory. The array type requires a size (number of
elements) and an underlying data type.
:Syntax:
::
[<# elements> x <elementtype>]
The number of elements is a constant integer value; ``elementtype`` may
be any type with a size.
:Examples:
+------------------+--------------------------------------+
| ``[40 x i32]`` | Array of 40 32-bit integer values. |
+------------------+--------------------------------------+
| ``[41 x i32]`` | Array of 41 32-bit integer values. |
+------------------+--------------------------------------+
| ``[4 x i8]`` | Array of 4 8-bit integer values. |
+------------------+--------------------------------------+
Here are some examples of multidimensional arrays:
+-----------------------------+----------------------------------------------------------+
| ``[3 x [4 x i32]]`` | 3x4 array of 32-bit integer values. |
+-----------------------------+----------------------------------------------------------+
| ``[12 x [10 x float]]`` | 12x10 array of single precision floating point values. |
+-----------------------------+----------------------------------------------------------+
| ``[2 x [3 x [4 x i16]]]`` | 2x3x4 array of 16-bit integer values. |
+-----------------------------+----------------------------------------------------------+
There is no restriction on indexing beyond the end of the array implied
by a static type (though there are restrictions on indexing beyond the
bounds of an allocated object in some cases). This means that
single-dimension 'variable sized array' addressing can be implemented in
LLVM with a zero length array type. An implementation of 'pascal style
arrays' in LLVM could use the type "``{ i32, [0 x float]}``", for
example.
.. _t_struct:
Structure Type
""""""""""""""
:Overview:
The structure type is used to represent a collection of data members
together in memory. The elements of a structure may be any type that has
a size.
Structures in memory are accessed using '``load``' and '``store``' by
getting a pointer to a field with the '``getelementptr``' instruction.
Structures in registers are accessed using the '``extractvalue``' and
'``insertvalue``' instructions.
Structures may optionally be "packed" structures, which indicate that
the alignment of the struct is one byte, and that there is no padding
between the elements. In non-packed structs, padding between field types
is inserted as defined by the DataLayout string in the module, which is
required to match what the underlying code generator expects.
Structures can either be "literal" or "identified". A literal structure
is defined inline with other types (e.g. ``{i32, i32}*``) whereas
identified types are always defined at the top level with a name.
Literal types are uniqued by their contents and can never be recursive
or opaque since there is no way to write one. Identified types can be
recursive, can be opaque, and are never uniqued.
:Syntax:
::
%T1 = type { <type list> } ; Identified normal struct type
%T2 = type <{ <type list> }> ; Identified packed struct type
:Examples:
+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``{ i32, i32, i32 }`` | A triple of three ``i32`` values |
+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``{ float, i32 (i32) * }`` | A pair, where the first element is a ``float`` and the second element is a :ref:`pointer <t_pointer>` to a :ref:`function <t_function>` that takes an ``i32``, returning an ``i32``. |
+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ``<{ i8, i32 }>`` | A packed struct known to be 5 bytes in size. |
+------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
.. _t_opaque:
Opaque Structure Types
""""""""""""""""""""""
:Overview:
Opaque structure types are used to represent named structure types that
do not have a body specified. This corresponds (for example) to the C
notion of a forward declared structure.
:Syntax:
::
%X = type opaque
%52 = type opaque
:Examples:
+--------------+-------------------+
| ``opaque`` | An opaque type. |
+--------------+-------------------+
.. _constants:
Constants
=========
LLVM has several different basic types of constants. This section
describes them all and their syntax.
Simple Constants
----------------
**Boolean constants**
The two strings '``true``' and '``false``' are both valid constants
of the ``i1`` type.
**Integer constants**
Standard integers (such as '4') are constants of the
:ref:`integer <t_integer>` type. Negative numbers may be used with
integer types.
**Floating point constants**
Floating point constants use standard decimal notation (e.g.
123.421), exponential notation (e.g. 1.23421e+2), or a more precise
hexadecimal notation (see below). The assembler requires the exact
decimal value of a floating-point constant. For example, the
assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
decimal in binary. Floating point constants must have a :ref:`floating
point <t_floating>` type.
**Null pointer constants**
The identifier '``null``' is recognized as a null pointer constant
and must be of :ref:`pointer type <t_pointer>`.
**Token constants**
The identifier '``none``' is recognized as an empty token constant
and must be of :ref:`token type <t_token>`.
The one non-intuitive notation for constants is the hexadecimal form of
floating point constants. For example, the form
'``double 0x432ff973cafa8000``' is equivalent to (but harder to read
than) '``double 4.5e+15``'. The only time hexadecimal floating point
constants are required (and the only time that they are generated by the
disassembler) is when a floating point constant must be emitted but it
cannot be represented as a decimal floating point number in a reasonable
number of digits. For example, NaN's, infinities, and other special
values are represented in their IEEE hexadecimal format so that assembly
and disassembly do not cause any bits to change in the constants.
When using the hexadecimal form, constants of types half, float, and
double are represented using the 16-digit form shown above (which
matches the IEEE754 representation for double); half and float values
must, however, be exactly representable as IEEE 754 half and single
precision, respectively. Hexadecimal format is always used for long
double, and there are three forms of long double. The 80-bit format used
by x86 is represented as ``0xK`` followed by 20 hexadecimal digits. The
128-bit format used by PowerPC (two adjacent doubles) is represented by
``0xM`` followed by 32 hexadecimal digits. The IEEE 128-bit format is
represented by ``0xL`` followed by 32 hexadecimal digits. Long doubles
will only work if they match the long double format on your target.
The IEEE 16-bit format (half precision) is represented by ``0xH``
followed by 4 hexadecimal digits. All hexadecimal formats are big-endian
(sign bit at the left).
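For instance, special values with no reasonable decimal spelling are written
in the hexadecimal forms described above (the global names are placeholders):
.. code-block:: llvm
    @dinf = global double 0x7FF0000000000000   ; double +infinity
    @dnan = global double 0x7FF8000000000000   ; a double quiet NaN
    @hinf = global half 0xH7C00                ; half +infinity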
There are no constants of type x86_mmx.
.. _complexconstants:
Complex Constants
-----------------
Complex constants are a (potentially recursive) combination of simple
constants and smaller complex constants.
**Structure constants**
Structure constants are represented with notation similar to
structure type definitions (a comma separated list of elements,
surrounded by braces (``{}``)). For example:
"``{ i32 4, float 17.0, i32* @G }``", where "``@G``" is declared as
"``@G = external global i32``". Structure constants must have
:ref:`structure type <t_struct>`, and the number and types of elements
must match those specified by the type.
**Array constants**
Array constants are represented with notation similar to array type
definitions (a comma separated list of elements, surrounded by
square brackets (``[]``)). For example:
"``[ i32 42, i32 11, i32 74 ]``". Array constants must have
:ref:`array type <t_array>`, and the number and types of elements must
match those specified by the type. As a special case, character array
constants may also be represented as a double-quoted string using the ``c``
prefix. For example: "``c"Hello World\0A\00"``".
**Vector constants**
Vector constants are represented with notation similar to vector
type definitions (a comma separated list of elements, surrounded by
less-than/greater-than's (``<>``)). For example:
"``< i32 42, i32 11, i32 74, i32 100 >``". Vector constants
must have :ref:`vector type <t_vector>`, and the number and types of
elements must match those specified by the type.
**Zero initialization**
The string '``zeroinitializer``' can be used to zero initialize a
value to zero of *any* type, including scalar and
:ref:`aggregate <t_aggregate>` types. This is often used to avoid
having to print large zero initializers (e.g. for large arrays) and
is always exactly equivalent to using explicit zero initializers.
**Metadata node**
A metadata node is a constant tuple without types. For example:
"``!{!0, !{!2, !0}, !"test"}``". Metadata can reference constant values,
for example: "``!{!0, i32 0, i8* @global, i64 (i64)* @function, !"str"}``".
Unlike other typed constants that are meant to be interpreted as part of
the instruction stream, metadata is a place to attach additional
information such as debug info.
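Put together, global initializers can combine these forms; the following
sketch (names are placeholders) shows a string-style character array, a
structure constant, a vector constant, and a zero-initialized array:
.. code-block:: llvm
    @str  = constant [6 x i8] c"hello\00"
    @pair = constant { i32, float } { i32 4, float 17.0 }
    @vec  = constant <4 x i32> <i32 42, i32 11, i32 74, i32 100>
    @big  = constant [256 x i32] zeroinitializer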
Global Variable and Function Addresses
--------------------------------------
The addresses of :ref:`global variables <globalvars>` and
:ref:`functions <functionstructure>` are always implicitly valid
(link-time) constants. These constants are explicitly referenced when
the :ref:`identifier for the global <identifiers>` is used and always have
:ref:`pointer <t_pointer>` type. For example, the following is a legal LLVM
file:
.. code-block:: llvm
@X = global i32 17
@Y = global i32 42
@Z = global [2 x i32*] [ i32* @X, i32* @Y ]
.. _undefvalues:
Undefined Values
----------------
The string '``undef``' can be used anywhere a constant is expected, and
indicates that the user of the value may receive an unspecified
bit-pattern. Undefined values may be of any type (other than '``label``'
or '``void``') and be used anywhere a constant is permitted.
Undefined values are useful because they indicate to the compiler that
the program is well defined no matter what value is used. This gives the
compiler more freedom to optimize. Here are some examples of
(potentially surprising) transformations that are valid (in pseudo IR):
.. code-block:: llvm
%A = add %X, undef
%B = sub %X, undef
%C = xor %X, undef
Safe:
%A = undef
%B = undef
%C = undef
This is safe because all of the output bits are affected by the undef
bits. Any output bit can have a zero or one depending on the input bits.
.. code-block:: llvm
%A = or %X, undef
%B = and %X, undef
Safe:
%A = -1
%B = 0
Safe:
%A = %X ;; By choosing undef as 0
%B = %X ;; By choosing undef as -1
Unsafe:
%A = undef
%B = undef
These logical operations have bits that are not always affected by the
input. For example, if ``%X`` has a zero bit, then the output of the
'``and``' operation will always be a zero for that bit, no matter what
the corresponding bit from the '``undef``' is. As such, it is unsafe to
optimize or assume that the result of the '``and``' is '``undef``'.
However, it is safe to assume that all bits of the '``undef``' could be
0, and optimize the '``and``' to 0. Likewise, it is safe to assume that
all the bits of the '``undef``' operand to the '``or``' could be set,
allowing the '``or``' to be folded to -1.
.. code-block:: llvm
%A = select undef, %X, %Y
%B = select undef, 42, %Y
%C = select %X, %Y, undef
Safe:
%A = %X (or %Y)
%B = 42 (or %Y)
%C = %Y
Unsafe:
%A = undef
%B = undef
%C = undef
This set of examples shows that undefined '``select``' (and conditional
branch) conditions can go *either way*, but they have to come from one
of the two operands. In the ``%A`` example, if ``%X`` and ``%Y`` were
both known to have a clear low bit, then ``%A`` would have to have a
cleared low bit. However, in the ``%C`` example, the optimizer is
allowed to assume that the '``undef``' operand could be the same as
``%Y``, allowing the whole '``select``' to be eliminated.
.. code-block:: text
%A = xor undef, undef
%B = undef
%C = xor %B, %B
%D = undef
%E = icmp slt %D, 4
%F = icmp gte %D, 4
Safe:
%A = undef
%B = undef
%C = undef
%D = undef
%E = undef
%F = undef
This example points out that two '``undef``' operands are not
necessarily the same. This can be surprising to people who assume that
"``X^X``" is always zero, even if ``X`` is undefined (this also matches
C semantics). This isn't true for a number of reasons, but the
short answer is that an '``undef``' "variable" can arbitrarily change
its value over its "live range". This is true because the variable
doesn't actually *have a live range*. Instead, the value is logically
read from arbitrary registers that happen to be around when needed, so
the value is not necessarily consistent over time. In fact, ``%A`` and
``%C`` need to have the same semantics or the core LLVM "replace all
uses with" concept would not hold.
.. code-block:: llvm
%A = fdiv undef, %X
%B = fdiv %X, undef
Safe:
%A = undef
b: unreachable
These examples show the crucial difference between an *undefined value*
and *undefined behavior*. An undefined value (like '``undef``') is
allowed to have an arbitrary bit-pattern. This means that the ``%A``
operation can be constant folded to '``undef``', because the '``undef``'
could be an SNaN, and ``fdiv`` is not (currently) defined on SNaN's.
However, in the second example, we can make a more aggressive
assumption: because the ``undef`` is allowed to be an arbitrary value,
we are allowed to assume that it could be zero. Since a divide by zero
has *undefined behavior*, we are allowed to assume that the operation
does not execute at all. This allows us to delete the divide and all
code after it. Because the undefined operation "can't happen", the
optimizer can assume that it occurs in dead code.
.. code-block:: text
a: store undef -> %X
b: store %X -> undef
Safe:
a: <deleted>
b: unreachable
These examples reiterate the ``fdiv`` example: a store *of* an undefined
value can be assumed to not have any effect; we can assume that the
value is overwritten with bits that happen to match what was already
there. However, a store *to* an undefined location could clobber
arbitrary memory, therefore, it has undefined behavior.
.. _poisonvalues:
Poison Values
-------------
Poison values are similar to :ref:`undef values <undefvalues>`, however
they also represent the fact that an instruction or constant expression
that cannot evoke side effects has nevertheless detected a condition
that results in undefined behavior.
There is currently no way of representing a poison value in the IR; they
only exist when produced by operations such as :ref:`add <i_add>` with
the ``nsw`` flag.
Poison value behavior is defined in terms of value *dependence*:
- Values other than :ref:`phi <i_phi>` nodes depend on their operands.
- :ref:`Phi <i_phi>` nodes depend on the operand corresponding to
their dynamic predecessor basic block.
- Function arguments depend on the corresponding actual argument values
in the dynamic callers of their functions.
- :ref:`Call <i_call>` instructions depend on the :ref:`ret <i_ret>`
instructions that dynamically transfer control back to them.
- :ref:`Invoke <i_invoke>` instructions depend on the
:ref:`ret <i_ret>`, :ref:`resume <i_resume>`, or exception-throwing
call instructions that dynamically transfer control back to them.
- Non-volatile loads and stores depend on the most recent stores to all
of the referenced memory addresses, following the order in the IR
(including loads and stores implied by intrinsics such as
:ref:`@llvm.memcpy <int_memcpy>`.)
- An instruction with externally visible side effects depends on the
most recent preceding instruction with externally visible side
effects, following the order in the IR. (This includes :ref:`volatile
operations <volatile>`.)
- An instruction *control-depends* on a :ref:`terminator
instruction <terminators>` if the terminator instruction has
multiple successors and the instruction is always executed when
control transfers to one of the successors, and may not be executed
when control is transferred to another.
- Additionally, an instruction also *control-depends* on a terminator
instruction if the set of instructions it otherwise depends on would
be different if the terminator had transferred control to a different
successor.
- Dependence is transitive.
Poison values have the same behavior as :ref:`undef values <undefvalues>`,
with the additional effect that any instruction that has a *dependence*
on a poison value has undefined behavior.
Here are some examples:
.. code-block:: llvm
entry:
%poison = sub nuw i32 0, 1 ; Results in a poison value.
%still_poison = and i32 %poison, 0 ; 0, but also poison.
%poison_yet_again = getelementptr i32, i32* @h, i32 %still_poison
store i32 0, i32* %poison_yet_again ; memory at @h[0] is poisoned
store i32 %poison, i32* @g ; Poison value stored to memory.
%poison2 = load i32, i32* @g ; Poison value loaded back from memory.
store volatile i32 %poison, i32* @g ; External observation; undefined behavior.
%narrowaddr = bitcast i32* @g to i16*
%wideaddr = bitcast i32* @g to i64*
%poison3 = load i16, i16* %narrowaddr ; Returns a poison value.
%poison4 = load i64, i64* %wideaddr ; Returns a poison value.
%cmp = icmp slt i32 %poison, 0 ; Returns a poison value.
br i1 %cmp, label %true, label %end ; Branch to either destination.
true:
store volatile i32 0, i32* @g ; This is control-dependent on %cmp, so
; it has undefined behavior.
br label %end
end:
%p = phi i32 [ 0, %entry ], [ 1, %true ]
; Both edges into this PHI are
; control-dependent on %cmp, so this
; always results in a poison value.
store volatile i32 0, i32* @g ; This would depend on the store in %true
; if %cmp is true, or the store in %entry
; otherwise, so this is undefined behavior.
br i1 %cmp, label %second_true, label %second_end
; The same branch again, but this time the
; true block doesn't have side effects.
second_true:
; No side effects!
ret void
second_end:
store volatile i32 0, i32* @g ; This time, the instruction always depends
; on the store in %end. Also, it is
; control-equivalent to %end, so this is
; well-defined (ignoring earlier undefined
; behavior in this example).
.. _blockaddress:
Addresses of Basic Blocks
-------------------------
``blockaddress(@function, %block)``
The '``blockaddress``' constant computes the address of the specified
basic block in the specified function, and always has an ``i8*`` type.
Taking the address of the entry block is illegal.
This value only has defined behavior when used as an operand to the
':ref:`indirectbr <i_indirectbr>`' instruction, or for comparisons
against null. Pointer equality tests between label addresses result in
undefined behavior --- though, again, comparison against null is ok, and
no label is equal to the null pointer. This may be passed around as an
opaque pointer sized value as long as the bits are not inspected. This
allows ``ptrtoint`` and arithmetic to be performed on these values so
long as the original value is reconstituted before the ``indirectbr``
instruction.
Finally, some targets may provide defined semantics when using the value
as the operand to an inline assembly, but that is target specific.
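For example, a computed-goto style dispatch table (a sketch; the function and
global names are illustrative) stores block addresses in a global and feeds
the selected one to ``indirectbr``:
.. code-block:: llvm
    @targets = constant [2 x i8*] [ i8* blockaddress(@dispatch, %one),
                                    i8* blockaddress(@dispatch, %two) ]
    define void @dispatch(i32 %i) {
    entry:
      %slot = getelementptr inbounds [2 x i8*], [2 x i8*]* @targets, i32 0, i32 %i
      %dest = load i8*, i8** %slot
      indirectbr i8* %dest, [label %one, label %two]
    one:
      ret void
    two:
      ret void
    }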
.. _constantexprs:
Constant Expressions
--------------------
Constant expressions are used to allow expressions involving other
constants to be used as constants. Constant expressions may be of any
:ref:`first class <t_firstclass>` type and may involve any LLVM operation
that does not have side effects (e.g. load and call are not supported).
The following is the syntax for constant expressions:
``trunc (CST to TYPE)``
Truncate a constant to another type. The bit size of CST must be
larger than the bit size of TYPE. Both types must be integers.
``zext (CST to TYPE)``
Zero extend a constant to another type. The bit size of CST must be
smaller than the bit size of TYPE. Both types must be integers.
``sext (CST to TYPE)``
Sign extend a constant to another type. The bit size of CST must be
smaller than the bit size of TYPE. Both types must be integers.
``fptrunc (CST to TYPE)``
Truncate a floating point constant to another floating point type.
The size of CST must be larger than the size of TYPE. Both types
must be floating point.
``fpext (CST to TYPE)``
Floating point extend a constant to another type. The size of CST
must be smaller or equal to the size of TYPE. Both types must be
floating point.
``fptoui (CST to TYPE)``
Convert a floating point constant to the corresponding unsigned
integer constant. TYPE must be a scalar or vector integer type. CST
must be of scalar or vector floating point type. Both CST and TYPE
must be scalars, or vectors of the same number of elements. If the
value won't fit in the integer type, the results are undefined.
``fptosi (CST to TYPE)``
Convert a floating point constant to the corresponding signed
integer constant. TYPE must be a scalar or vector integer type. CST
must be of scalar or vector floating point type. Both CST and TYPE
must be scalars, or vectors of the same number of elements. If the
value won't fit in the integer type, the results are undefined.
``uitofp (CST to TYPE)``
Convert an unsigned integer constant to the corresponding floating
point constant. TYPE must be a scalar or vector floating point type.
CST must be of scalar or vector integer type. Both CST and TYPE must
be scalars, or vectors of the same number of elements. If the value
won't fit in the floating point type, the results are undefined.
``sitofp (CST to TYPE)``
Convert a signed integer constant to the corresponding floating
point constant. TYPE must be a scalar or vector floating point type.
CST must be of scalar or vector integer type. Both CST and TYPE must
be scalars, or vectors of the same number of elements. If the value
won't fit in the floating point type, the results are undefined.
``ptrtoint (CST to TYPE)``
Convert a pointer typed constant to the corresponding integer
constant. ``TYPE`` must be an integer type. ``CST`` must be of
pointer type. The ``CST`` value is zero extended, truncated, or
unchanged to make it fit in ``TYPE``.
``inttoptr (CST to TYPE)``
Convert an integer constant to a pointer constant. TYPE must be a
pointer type. CST must be of integer type. The CST value is zero
extended, truncated, or unchanged to make it fit in a pointer size.
This one is *really* dangerous!
``bitcast (CST to TYPE)``
Convert a constant, CST, to another TYPE. The constraints of the
operands are the same as those for the :ref:`bitcast
instruction <i_bitcast>`.
``addrspacecast (CST to TYPE)``
Convert a constant pointer or constant vector of pointer, CST, to another
TYPE in a different address space. The constraints of the operands are the
same as those for the :ref:`addrspacecast instruction <i_addrspacecast>`.
``getelementptr (TY, CSTPTR, IDX0, IDX1, ...)``, ``getelementptr inbounds (TY, CSTPTR, IDX0, IDX1, ...)``
Perform the :ref:`getelementptr operation <i_getelementptr>` on
constants. As with the :ref:`getelementptr <i_getelementptr>`
instruction, the index list may have one or more indexes, which are
required to make sense for the type of "pointer to TY".
``select (COND, VAL1, VAL2)``
Perform the :ref:`select operation <i_select>` on constants.
``icmp COND (VAL1, VAL2)``
Performs the :ref:`icmp operation <i_icmp>` on constants.
``fcmp COND (VAL1, VAL2)``
Performs the :ref:`fcmp operation <i_fcmp>` on constants.
``extractelement (VAL, IDX)``
Perform the :ref:`extractelement operation <i_extractelement>` on
constants.
``insertelement (VAL, ELT, IDX)``
Perform the :ref:`insertelement operation <i_insertelement>` on
constants.
``shufflevector (VEC1, VEC2, IDXMASK)``
Perform the :ref:`shufflevector operation <i_shufflevector>` on
constants.
``extractvalue (VAL, IDX0, IDX1, ...)``
Perform the :ref:`extractvalue operation <i_extractvalue>` on
constants. The index list is interpreted in a similar manner as
indices in a ':ref:`getelementptr <i_getelementptr>`' operation. At
least one index value must be specified.
``insertvalue (VAL, ELT, IDX0, IDX1, ...)``
Perform the :ref:`insertvalue operation <i_insertvalue>` on constants.
The index list is interpreted in a similar manner as indices in a
':ref:`getelementptr <i_getelementptr>`' operation. At least one index
value must be specified.
``OPCODE (LHS, RHS)``
Perform the specified operation on the LHS and RHS constants. OPCODE
may be any of the :ref:`binary <binaryops>` or :ref:`bitwise
binary <bitwiseops>` operations. The constraints on operands are
the same as those for the corresponding instruction (e.g. no bitwise
operations on floating point values are allowed).
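As a hedged illustration (the global names here are invented for the example),
constant expressions often appear in global initializers:
.. code-block:: llvm
@array = global [4 x i32] zeroinitializer
; the address of the third element, computed as a constant expression
@elem = global i32* getelementptr inbounds ([4 x i32], [4 x i32]* @array, i64 0, i64 2)
; the address of @array as a 64-bit integer
@addr = global i64 ptrtoint ([4 x i32]* @array to i64)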
Other Values
============
.. _inlineasmexprs:
Inline Assembler Expressions
----------------------------
LLVM supports inline assembler expressions (as opposed to :ref:`Module-Level
Inline Assembly <moduleasm>`) through the use of a special value. This value
represents the inline assembler as a template string (containing the
instructions to emit), a list of operand constraints (stored as a string), a
flag that indicates whether or not the inline asm expression has side effects,
and a flag indicating whether the function containing the asm needs to align its
stack conservatively.
The template string supports argument substitution of the operands using "``$``"
followed by a number, to indicate substitution of the given register/memory
location, as specified by the constraint string. "``${NUM:MODIFIER}``" may also
be used, where ``MODIFIER`` is a target-specific annotation for how to print the
operand (See :ref:`inline-asm-modifiers`).
A literal "``$``" may be included by using "``$$``" in the template. To include
other special characters into the output, the usual "``\XX``" escapes may be
used, just as in other strings. Note that after template substitution, the
resulting assembly string is parsed by LLVM's integrated assembler unless it is
disabled -- even when emitting a ``.s`` file -- and thus must contain assembly
syntax known to LLVM.
LLVM also supports a few more substitutions useful for writing inline assembly:
- ``${:uid}``: Expands to a decimal integer unique to this inline assembly blob.
This substitution is useful when declaring a local label. Many standard
compiler optimizations, such as inlining, may duplicate an inline asm blob.
Adding a blob-unique identifier ensures that the two labels will not conflict
during assembly. This is used to implement `GCC's %= special format
string <https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html>`_.
- ``${:comment}``: Expands to the comment character of the current target's
assembly dialect. This is usually ``#``, but many targets use other strings,
such as ``;``, ``//``, or ``!``.
- ``${:private}``: Expands to the assembler private label prefix. Labels with
this prefix will not appear in the symbol table of the assembled object.
Typically the prefix is ``L``, but targets may use other strings. ``.L`` is
relatively popular.
LLVM's support for inline asm is modeled closely on the requirements of Clang's
GCC-compatible inline-asm support. Thus, the feature-set and the constraint and
modifier codes listed here are similar or identical to those in GCC's inline asm
support. However, to be clear, the syntax of the template and constraint strings
described here is *not* the same as the syntax accepted by GCC and Clang, and,
while most constraint letters are passed through as-is by Clang, some get
translated to other codes when converting from the C source to the LLVM
assembly.
An example inline assembler expression is:
.. code-block:: llvm
i32 (i32) asm "bswap $0", "=r,r"
Inline assembler expressions may **only** be used as the callee operand
of a :ref:`call <i_call>` or an :ref:`invoke <i_invoke>` instruction.
Thus, typically we have:
.. code-block:: llvm
%X = call i32 asm "bswap $0", "=r,r"(i32 %Y)
Inline asms with side effects not visible in the constraint list must be
marked as having side effects. This is done through the use of the
'``sideeffect``' keyword, like so:
.. code-block:: llvm
call void asm sideeffect "eieio", ""()
In some cases inline asms will contain code that will not work unless
the stack is aligned in some way, such as calls or SSE instructions on
x86, yet will not contain code that does that alignment within the asm.
The compiler should make conservative assumptions about what the asm
might contain and should generate its usual stack alignment code in the
prologue if the '``alignstack``' keyword is present:
.. code-block:: llvm
call void asm alignstack "eieio", ""()
Inline asms also support using non-standard assembly dialects. The
assumed dialect is ATT. When the '``inteldialect``' keyword is present,
the inline asm is using the Intel dialect. Currently, ATT and Intel are
the only supported dialects. An example is:
.. code-block:: llvm
call void asm inteldialect "eieio", ""()
If multiple keywords appear, the '``sideeffect``' keyword must come
first, the '``alignstack``' keyword second, and the '``inteldialect``'
keyword last.
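For instance, a minimal sketch using all three keywords in the required order
(the template is just a placeholder instruction):
.. code-block:: llvm
call void asm sideeffect alignstack inteldialect "nop", ""()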
Inline Asm Constraint String
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The constraint list is a comma-separated string, each element containing one or
more constraint codes.
For each element in the constraint list an appropriate register or memory
operand will be chosen, and it will be made available to assembly template
string expansion as ``$0`` for the first constraint in the list, ``$1`` for the
second, etc.
There are three different types of constraints, which are distinguished by a
prefix symbol in front of the constraint code: Output, Input, and Clobber. The
constraints must always be given in that order: outputs first, then inputs, then
clobbers. They cannot be intermingled.
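For example, a hedged sketch with one output, one input, and one clobber, in
the required order (the template is illustrative x86 AT&T syntax):
.. code-block:: llvm
%out = call i32 asm "movl $1, $0", "=r,r,~{memory}"(i32 %in)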
There are also three different categories of constraint codes:
- Register constraint. This is either a register class, or a fixed physical
register. This kind of constraint will allocate a register, and if necessary,
bitcast the argument or result to the appropriate type.
- Memory constraint. This kind of constraint is for use with an instruction
taking a memory operand. Different constraints allow for different addressing
modes used by the target.
- Immediate value constraint. This kind of constraint is for an integer or other
immediate value which can be rendered directly into an instruction. The
various target-specific constraints allow the selection of a value in the
proper range for the instruction you wish to use it with.
Output constraints
""""""""""""""""""
Output constraints are specified by an "``=``" prefix (e.g. "``=r``"). This
indicates that the assembly will write to this operand, and the operand will
then be made available as a return value of the ``asm`` expression. Output
constraints do not consume an argument from the call instruction. (Except, see
below about indirect outputs).
Normally, it is expected that no output locations are written to by the assembly
expression until *all* of the inputs have been read. As such, LLVM may assign
the same register to an output and an input. If this is not safe (e.g. if the
assembly contains two instructions, where the first writes to one output, and
the second reads an input and writes to a second output), then the "``&``"
modifier must be used (e.g. "``=&r``") to specify that the output is an
"early-clobber" output. Marking an output as "early-clobber" ensures that LLVM
will not use the same register for any inputs (other than an input tied to this
output).
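As a hedged sketch (x86 AT&T syntax; value names are illustrative), an asm
whose first instruction writes output 0 before the last use of the input marks
that output early-clobber; multiple outputs are returned as a struct:
.. code-block:: llvm
%res = call { i32, i32 } asm "movl $2, $0\0A\09movl $2, $1", "=&r,=r,r"(i32 %in)
%first = extractvalue { i32, i32 } %res, 0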
Input constraints
"""""""""""""""""
Input constraints do not have a prefix -- just the constraint codes. Each input
constraint will consume one argument from the call instruction. It is not
permitted for the asm to write to any input register or memory location (unless
that input is tied to an output). Note also that multiple inputs may all be
assigned to the same register, if LLVM can determine that they necessarily all
contain the same value.
Instead of providing a Constraint Code, input constraints may also "tie"
themselves to an output constraint, by providing an integer as the constraint
string. Tied inputs still consume an argument from the call instruction, and
take up a position in the asm template numbering as is usual -- they will simply
be constrained to always use the same register as the output they've been tied
to. For example, a constraint string of "``=r,0``" says to assign a register for
output, and use that register as an input as well (it being the 0'th
constraint).
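A hedged example of the "``=r,0``" tied constraint described above (the
template is illustrative x86 AT&T syntax):
.. code-block:: llvm
; $0 names both the output and the tied input, so the increment
; happens in place in a single register
%inc = call i32 asm "incl $0", "=r,0"(i32 %val)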
It is permitted to tie an input to an "early-clobber" output. In that case, no
*other* input may share the same register as the input tied to the early-clobber
(even when the other input has the same value).
You may only tie an input to an output which has a register constraint, not a
memory constraint. Only a single input may be tied to an output.
There is also an "interesting" feature which deserves a bit of explanation: if a
register class constraint allocates a register which is too small for the value
type operand provided as input, the input value will be split into multiple
registers, and all of them passed to the inline asm.
However, this feature is often not as useful as you might think.
Firstly, the registers are *not* guaranteed to be consecutive. So, on those
architectures that have instructions which operate on multiple consecutive
registers, this is not an appropriate way to support them. (e.g. the 32-bit
SparcV8 has a 64-bit load instruction which takes a single 32-bit register as
its operand. The hardware then loads into both the named register and the next
register. This feature of inline asm would not be useful to support that.)
A few of the targets provide a template string modifier allowing explicit access
to the second register of a two-register operand (e.g. MIPS ``L``, ``M``, and
``D``). On such an architecture, you can actually access the second allocated
register (yet, still, not any subsequent ones). But, in that case, you're still
probably better off simply splitting the value into two separate operands, for
clarity. (e.g. see the description of the ``A`` constraint on X86, which,
despite existing only for use with this feature, is not really a good idea to
use)
Indirect inputs and outputs
"""""""""""""""""""""""""""
Indirect output or input constraints can be specified by the "``*``" modifier
(which goes after the "``=``" in case of an output). This indicates that the asm
will write to or read from the contents of an *address* provided as an input
argument. (Note that in this way, indirect outputs act more like an *input* than
an output: just like an input, they consume an argument of the call expression,
rather than producing a return value. An indirect output constraint is an
"output" only in that the asm is expected to write to the contents of the input
memory location, instead of just read from it).
This is most typically used for memory constraints, e.g. "``=*m``", to pass the
address of a variable as a value.
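A minimal sketch of an indirect memory output (x86 AT&T syntax; names are
illustrative): the asm stores through the pointer passed as the first call
argument:
.. code-block:: llvm
call void asm "movl $1, $0", "=*m,r"(i32* %ptr, i32 %val)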
It is also possible to use an indirect *register* constraint, but only on output
(e.g. "``=*r``"). This will cause LLVM to allocate a register for an output
value normally, and then, separately emit a store to the address provided as
input, after the provided inline asm. (It's not clear what value this
functionality provides, compared to writing the store explicitly after the asm
statement, and it can only produce worse code, since it bypasses many
optimization passes. I would recommend not using it.)
Clobber constraints
"""""""""""""""""""
A clobber constraint is indicated by a "``~``" prefix. A clobber does not
consume an input operand, nor generate an output. Clobbers cannot use any of the
general constraint code letters -- they may use only explicit register
constraints, e.g. "``~{eax}``". The one exception is that a clobber string of
"``~{memory}``" indicates that the assembly writes to arbitrary undeclared
memory locations -- not only the memory pointed to by a declared indirect
output.
Note that clobbering named registers that are also present in output
constraints is not legal.
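A hedged x86 sketch: an asm statement with no inputs or outputs that declares
the registers it overwrites, plus arbitrary memory, as clobbers:
.. code-block:: llvm
call void asm sideeffect "cpuid", "~{eax},~{ebx},~{ecx},~{edx},~{memory}"()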
Constraint Codes
""""""""""""""""
After the optional prefix comes the constraint code, or codes.
A Constraint Code is either a single letter (e.g. "``r``"), a "``^``" character
followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``"
(e.g. "``{eax}``").
The one and two letter constraint codes are typically chosen to be the same as
GCC's constraint codes.
A single constraint may include one or more constraint codes in it, leaving
it up to LLVM to choose which one to use. This is included mainly for
compatibility with the translation of GCC inline asm coming from clang.
There are two ways to specify alternatives, and either or both may be used in an
inline asm constraint list:
1) Append the codes to each other, making a constraint code set. E.g. "``im``"
or "``{eax}m``". This means "choose any of the options in the set". The
choice of constraint is made independently for each constraint in the
constraint list.
2) Use "``|``" between constraint code sets, creating alternatives. Every
constraint in the constraint list must have the same number of alternative
sets. With this syntax, the same alternative in *all* of the items in the
constraint list will be chosen together.
Putting those together, you might have a two operand constraint string like
``"rm|r,ri|rm"``. This indicates that if operand 0 is ``r`` or ``m``, then
operand 1 may be one of ``r`` or ``i``. If operand 0 is ``r``, then operand 1
may be one of ``r`` or ``m``. But, operand 0 and 1 cannot both be of type m.
However, the use of either of the alternatives features is *NOT* recommended, as
LLVM is not able to make an intelligent choice about which one to use. (At the
point it currently needs to choose, not enough information is available to do so
in a smart way.) Thus, it simply tries to make a choice that's most likely to
compile, not one that will give optimal performance. (e.g., given "``rm``", it'll
always choose to use memory, not registers). And, if given multiple registers,
or multiple register classes, it will simply choose the first one. (In fact, it
doesn't currently even ensure explicitly specified physical registers are
unique, so specifying multiple physical registers as alternatives, like
``{r11}{r12},{r11}{r12}``, will assign r11 to both operands, not at all what was
intended.)
Supported Constraint Code List
""""""""""""""""""""""""""""""
The constraint codes are, in general, expected to behave the same way they do in
GCC. LLVM's support is often implemented on an 'as-needed' basis, to support C
inline asm code which was supported by GCC. A mismatch in behavior between LLVM
and GCC likely indicates a bug in LLVM.
Some constraint codes are typically supported by all targets:
- ``r``: A register in the target's general purpose register class.
- ``m``: A memory address operand. It is target-specific what addressing modes
are supported, typical examples are register, or register + register offset,
or register + immediate offset (of some target-specific size).
- ``i``: An integer constant (of target-specific width). Allows either a simple
immediate, or a relocatable value.
- ``n``: An integer constant -- *not* including relocatable values.
- ``s``: An integer constant, but allowing *only* relocatable values.
- ``X``: Allows an operand of any kind, no constraint whatsoever. Typically
useful to pass a label for an asm branch or call.
.. FIXME: but that surely isn't actually okay to jump out of an asm
block without telling llvm about the control transfer???)
- ``{register-name}``: Requires exactly the named physical register.
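As a hedged sketch of a few of these common codes (the assembly templates use
x86 AT&T syntax, and the register and value names are purely illustrative):
.. code-block:: llvm
; "=r": any general purpose register for the output; "r": any GPR input
%tripled = call i32 asm "leal ($1,$1,2), $0", "=r,r"(i32 %x)
; "i": an immediate rendered directly into the instruction
call void asm sideeffect "int $0", "i"(i32 3)
; "={eax}" / "{ecx}": require specific physical registers
%moved = call i32 asm "movl $1, $0", "={eax},{ecx}"(i32 %y)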
Other constraints are target-specific:
AArch64:
- ``z``: An immediate integer 0. Outputs ``WZR`` or ``XZR``, as appropriate.
- ``I``: An immediate integer valid for an ``ADD`` or ``SUB`` instruction,
i.e. 0 to 4095 with optional shift by 12.
- ``J``: An immediate integer that, when negated, is valid for an ``ADD`` or
``SUB`` instruction, i.e. -1 to -4095 with optional left shift by 12.
- ``K``: An immediate integer that is valid for the 'bitmask immediate 32' of a
logical instruction like ``AND``, ``EOR``, or ``ORR`` with a 32-bit register.
- ``L``: An immediate integer that is valid for the 'bitmask immediate 64' of a
logical instruction like ``AND``, ``EOR``, or ``ORR`` with a 64-bit register.
- ``M``: An immediate integer for use with the ``MOV`` assembly alias on a
32-bit register. This is a superset of ``K``: in addition to the bitmask
immediate, also allows immediate integers which can be loaded with a single
``MOVZ`` or ``MOVL`` instruction.
- ``N``: An immediate integer for use with the ``MOV`` assembly alias on a
64-bit register. This is a superset of ``L``.
- ``Q``: Memory address operand must be in a single register (no
offsets). (However, LLVM currently does this for the ``m`` constraint as
well.)
- ``r``: A 32 or 64-bit integer register (W* or X*).
- ``w``: A 32, 64, or 128-bit floating-point/SIMD register.
- ``x``: A lower 128-bit floating-point/SIMD register (``V0`` to ``V15``).
AMDGPU:
- ``r``: A 32 or 64-bit integer register.
- ``[0-9]v``: The 32-bit VGPR register, number 0-9.
- ``[0-9]s``: The 32-bit SGPR register, number 0-9.
All ARM modes:
- ``Q``, ``Um``, ``Un``, ``Uq``, ``Us``, ``Ut``, ``Uv``, ``Uy``: Memory address
operand. Treated the same as operand ``m``, at the moment.
ARM and ARM's Thumb2 mode:
- ``j``: An immediate integer between 0 and 65535 (valid for ``MOVW``)
- ``I``: An immediate integer valid for a data-processing instruction.
- ``J``: An immediate integer between -4095 and 4095.
- ``K``: An immediate integer whose bitwise inverse is valid for a
data-processing instruction. (Can be used with template modifier "``B``" to
print the inverted value).
- ``L``: An immediate integer whose negation is valid for a data-processing
instruction. (Can be used with template modifier "``n``" to print the negated
value).
- ``M``: A power of two or an integer between 0 and 32.
- ``N``: Invalid immediate constraint.
- ``O``: Invalid immediate constraint.
- ``r``: A general-purpose 32-bit integer register (``r0-r15``).
- ``l``: In Thumb2 mode, low 32-bit GPR registers (``r0-r7``). In ARM mode, same
as ``r``.
- ``h``: In Thumb2 mode, a high 32-bit GPR register (``r8-r15``). In ARM mode,
invalid.
- ``w``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s31``,
``d0-d31``, or ``q0-q15``.
- ``x``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s15``,
``d0-d7``, or ``q0-q3``.
- ``t``: A floating-point/SIMD register, only supports 32-bit values:
``s0-s31``.
ARM's Thumb1 mode:
- ``I``: An immediate integer between 0 and 255.
- ``J``: An immediate integer between -255 and -1.
- ``K``: An immediate integer between 0 and 255, with optional left-shift by
some amount.
- ``L``: An immediate integer between -7 and 7.
- ``M``: An immediate integer which is a multiple of 4 between 0 and 1020.
- ``N``: An immediate integer between 0 and 31.
- ``O``: An immediate integer which is a multiple of 4 between -508 and 508.
- ``r``: A low 32-bit GPR register (``r0-r7``).
- ``l``: A low 32-bit GPR register (``r0-r7``).
- ``h``: A high GPR register (``r8-r15``).
- ``w``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s31``,
``d0-d31``, or ``q0-q15``.
- ``x``: A 32, 64, or 128-bit floating-point/SIMD register: ``s0-s15``,
``d0-d7``, or ``q0-q3``.
- ``t``: A floating-point/SIMD register, only supports 32-bit values:
``s0-s31``.
Hexagon:
- ``o``, ``v``: A memory address operand, treated the same as constraint ``m``,
at the moment.
- ``r``: A 32 or 64-bit register.
MSP430:
- ``r``: An 8 or 16-bit register.
MIPS:
- ``I``: An immediate signed 16-bit integer.
- ``J``: An immediate integer zero.
- ``K``: An immediate unsigned 16-bit integer.
- ``L``: An immediate 32-bit integer, where the lower 16 bits are 0.
- ``N``: An immediate integer between -65535 and -1.
- ``O``: An immediate signed 15-bit integer.
- ``P``: An immediate integer between 1 and 65535.
- ``m``: A memory address operand. In MIPS-SE mode, allows a base address
register plus 16-bit immediate offset. In MIPS mode, just a base register.
- ``R``: A memory address operand. In MIPS-SE mode, allows a base address
register plus a 9-bit signed offset. In MIPS mode, the same as constraint
``m``.
- ``ZC``: A memory address operand, suitable for use in a ``pref``, ``ll``, or
``sc`` instruction on the given subtarget (details vary).
- ``r``, ``d``, ``y``: A 32 or 64-bit GPR register.
- ``f``: A 32 or 64-bit FPU register (``F0-F31``), or a 128-bit MSA register
(``W0-W31``). In the case of MSA registers, it is recommended to use the ``w``
argument modifier for compatibility with GCC.
- ``c``: A 32-bit or 64-bit GPR register suitable for indirect jump (always
``25``).
- ``l``: The ``lo`` register, 32 or 64-bit.
- ``x``: Invalid.
NVPTX:
- ``b``: A 1-bit integer register.
- ``c`` or ``h``: A 16-bit integer register.
- ``r``: A 32-bit integer register.
- ``l`` or ``N``: A 64-bit integer register.
- ``f``: A 32-bit float register.
- ``d``: A 64-bit float register.
PowerPC:
- ``I``: An immediate signed 16-bit integer.
- ``J``: An immediate unsigned 16-bit integer, shifted left 16 bits.
- ``K``: An immediate unsigned 16-bit integer.
- ``L``: An immediate signed 16-bit integer, shifted left 16 bits.
- ``M``: An immediate integer greater than 31.
- ``N``: An immediate integer that is an exact power of 2.
- ``O``: The immediate integer constant 0.
- ``P``: An immediate integer constant whose negation is a signed 16-bit
constant.
- ``es``, ``o``, ``Q``, ``Z``, ``Zy``: A memory address operand, currently
treated the same as ``m``.
- ``r``: A 32 or 64-bit integer register.
- ``b``: A 32 or 64-bit integer register, excluding ``R0`` (that is:
``R1-R31``).
- ``f``: A 32 or 64-bit float register (``F0-F31``), or when QPX is enabled, a
128 or 256-bit QPX register (``Q0-Q31``; aliases the ``F`` registers).
- ``v``: For ``4 x f32`` or ``4 x f64`` types, when QPX is enabled, a
128 or 256-bit QPX register (``Q0-Q31``), otherwise a 128-bit
altivec vector register (``V0-V31``).
.. FIXME: is this a bug that v accepts QPX registers? I think this
is supposed to only use the altivec vector registers?
- ``y``: Condition register (``CR0-CR7``).
- ``wc``: An individual CR bit in a CR register.
- ``wa``, ``wd``, ``wf``: Any 128-bit VSX vector register, from the full VSX
register set (overlapping both the floating-point and vector register files).
- ``ws``: A 32 or 64-bit floating point register, from the full VSX register
set.
Sparc:
- ``I``: An immediate 13-bit signed integer.
- ``r``: A 32-bit integer register.
- ``f``: Any floating-point register on SparcV8, or a floating point
register in the "low" half of the registers on SparcV9.
- ``e``: Any floating point register. (Same as ``f`` on SparcV8.)
SystemZ:
- ``I``: An immediate unsigned 8-bit integer.
- ``J``: An immediate unsigned 12-bit integer.
- ``K``: An immediate signed 16-bit integer.
- ``L``: An immediate signed 20-bit integer.
- ``M``: An immediate integer 0x7fffffff.
- ``Q``: A memory address operand with a base address and a 12-bit immediate
unsigned displacement.
- ``R``: A memory address operand with a base address, a 12-bit immediate
unsigned displacement, and an index register.
- ``S``: A memory address operand with a base address and a 20-bit immediate
signed displacement.
- ``T``: A memory address operand with a base address, a 20-bit immediate
signed displacement, and an index register.
- ``r`` or ``d``: A 32, 64, or 128-bit integer register.
- ``a``: A 32, 64, or 128-bit integer address register (excludes R0, which in an
address context evaluates as zero).
- ``h``: A 32-bit value in the high part of a 64-bit data register
(LLVM-specific).
- ``f``: A 32, 64, or 128-bit floating point register.
X86:
- ``I``: An immediate integer between 0 and 31.
- ``J``: An immediate integer between 0 and 63.
- ``K``: An immediate signed 8-bit integer.
- ``L``: An immediate integer, 0xff or 0xffff or (in 64-bit mode only)
0xffffffff.
- ``M``: An immediate integer between 0 and 3.
- ``N``: An immediate unsigned 8-bit integer.
- ``O``: An immediate integer between 0 and 127.
- ``e``: An immediate 32-bit signed integer.
- ``Z``: An immediate 32-bit unsigned integer.
- ``o``, ``v``: Treated the same as ``m``, at the moment.
- ``q``: An 8, 16, 32, or 64-bit register which can be accessed as an 8-bit
``l`` integer register. On X86-32, this is the ``a``, ``b``, ``c``, and ``d``
registers, and on X86-64, it is all of the integer registers.
- ``Q``: An 8, 16, 32, or 64-bit register which can be accessed as an 8-bit
``h`` integer register. This is the ``a``, ``b``, ``c``, and ``d`` registers.
- ``r`` or ``l``: An 8, 16, 32, or 64-bit integer register.
- ``R``: An 8, 16, 32, or 64-bit "legacy" integer register -- one which has
existed since i386, and can be accessed without the REX prefix.
- ``f``: A 32, 64, or 80-bit '387 FPU stack pseudo-register.
- ``y``: A 64-bit MMX register, if MMX is enabled.
- ``x``: If SSE is enabled: a 32 or 64-bit scalar operand, or 128-bit vector
operand in a SSE register. If AVX is also enabled, can also be a 256-bit
vector operand in an AVX register. If AVX-512 is also enabled, can also be a
512-bit vector operand in an AVX512 register. Otherwise, an error.
- ``Y``: The same as ``x``, if *SSE2* is enabled, otherwise an error.
- ``A``: Special case: allocates EAX first, then EDX, for a single operand (in
32-bit mode, a 64-bit integer operand will get split into two registers). It
is not recommended to use this constraint, as in 64-bit mode, the 64-bit
operand will get allocated only to RAX -- if two 32-bit operands are needed,
you're better off splitting it yourself, before passing it to the asm
statement.
XCore:
- ``r``: A 32-bit integer register.
.. _inline-asm-modifiers:
Asm template argument modifiers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In the asm template string, modifiers can be used on the operand reference, like
"``${0:n}``".
The modifiers are, in general, expected to behave the same way they do in
GCC. LLVM's support is often implemented on an 'as-needed' basis, to support C
inline asm code which was supported by GCC. A mismatch in behavior between LLVM
and GCC likely indicates a bug in LLVM.
Target-independent:
- ``c``: Print an immediate integer constant unadorned, without
the target-specific immediate punctuation (e.g. no ``$`` prefix).
- ``n``: Negate and print immediate integer constant unadorned, without the
target-specific immediate punctuation (e.g. no ``$`` prefix).
- ``l``: Print as an unadorned label, without the target-specific label
punctuation (e.g. no ``$`` prefix).
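As a hedged x86 sketch of the ``c`` modifier (the symbol name ``answer`` is
illustrative); without ``:c`` the immediate would be printed with its ``$``
punctuation:
.. code-block:: llvm
; emits ".set answer, 42" rather than ".set answer, $42"
call void asm sideeffect ".set answer, ${0:c}", "i"(i32 42)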
AArch64:
- ``w``: Print a GPR register with a ``w*`` name instead of ``x*`` name. E.g.,
instead of ``x30``, print ``w30``.
- ``x``: Print a GPR register with an ``x*`` name. (This is the default, anyhow.)
- ``b``, ``h``, ``s``, ``d``, ``q``: Print a floating-point/SIMD register with a
``b*``, ``h*``, ``s*``, ``d*``, or ``q*`` name, rather than the default of
``v*``.
AMDGPU:
- ``r``: No effect.
ARM:
- ``a``: Print an operand as an address (with ``[`` and ``]`` surrounding a
register).
- ``P``: No effect.
- ``q``: No effect.
- ``y``: Print a VFP single-precision register as an indexed double (e.g. print
as ``d4[1]`` instead of ``s9``)
- ``B``: Bitwise invert and print an immediate integer constant without ``#``
prefix.
- ``L``: Print the low 16-bits of an immediate integer constant.
- ``M``: Print as a register set suitable for ldm/stm. Also prints *all*
register operands subsequent to the specified one (!), so use carefully.
- ``Q``: Print the low-order register of a register-pair, or the low-order
register of a two-register operand.
- ``R``: Print the high-order register of a register-pair, or the high-order
register of a two-register operand.
- ``H``: Print the second register of a register-pair. (On a big-endian system,
``H`` is equivalent to ``Q``, and on little-endian system, ``H`` is equivalent
to ``R``.)
.. FIXME: H doesn't currently support printing the second register
of a two-register operand.
- ``e``: Print the low doubleword register of a NEON quad register.
- ``f``: Print the high doubleword register of a NEON quad register.
- ``m``: Print the base register of a memory operand without the ``[`` and ``]``
adornment.
Hexagon:
- ``L``: Print the second register of a two-register operand. Requires that it
has been allocated consecutively to the first.
.. FIXME: why is it restricted to consecutive ones? And there's
nothing that ensures that happens, is there?
- ``I``: Print the letter 'i' if the operand is an integer constant, otherwise
nothing. Used to print 'addi' vs 'add' instructions.
MSP430:
No additional modifiers.
MIPS:
- ``X``: Print an immediate integer as hexadecimal
- ``x``: Print the low 16 bits of an immediate integer as hexadecimal.
- ``d``: Print an immediate integer as decimal.
- ``m``: Subtract one and print an immediate integer as decimal.
- ``z``: Print $0 if an immediate zero, otherwise print normally.
- ``L``: Print the low-order register of a two-register operand, or prints the
address of the low-order word of a double-word memory operand.
.. FIXME: L seems to be missing memory operand support.
- ``M``: Print the high-order register of a two-register operand, or prints the
address of the high-order word of a double-word memory operand.
.. FIXME: M seems to be missing memory operand support.
- ``D``: Print the second register of a two-register operand, or prints the
second word of a double-word memory operand. (On a big-endian system, ``D`` is
equivalent to ``L``, and on little-endian system, ``D`` is equivalent to
``M``.)
- ``w``: No effect. Provided for compatibility with GCC which requires this
modifier in order to print MSA registers (``W0-W31``) with the ``f``
constraint.
NVPTX:
- ``r``: No effect.
PowerPC:
- ``L``: Print the second register of a two-register operand. Requires that it
has been allocated consecutively to the first.
.. FIXME: why is it restricted to consecutive ones? And there's
nothing that ensures that happens, is there?
- ``I``: Print the letter 'i' if the operand is an integer constant, otherwise
nothing. Used to print 'addi' vs 'add' instructions.
- ``y``: For a memory operand, prints formatter for a two-register X-form
instruction. (Currently always prints ``r0,OPERAND``).
- ``U``: Prints 'u' if the memory operand is an update form, and nothing
otherwise. (NOTE: LLVM does not support update form, so this will currently
always print nothing)
- ``X``: Prints 'x' if the memory operand is an indexed form. (NOTE: LLVM does
not support indexed form, so this will currently always print nothing)
Sparc:
- ``r``: No effect.
SystemZ:
SystemZ implements only ``n``, and does *not* support any of the other
target-independent modifiers.
X86:
- ``c``: Print an unadorned integer or symbol name. (The latter is
target-specific behavior for this typically target-independent modifier).
- ``A``: Print a register name with a '``*``' before it.
- ``b``: Print an 8-bit register name (e.g. ``al``); do nothing on a memory
operand.
- ``h``: Print the upper 8-bit register name (e.g. ``ah``); do nothing on a
memory operand.
- ``w``: Print the 16-bit register name (e.g. ``ax``); do nothing on a memory
operand.
- ``k``: Print the 32-bit register name (e.g. ``eax``); do nothing on a memory
operand.
- ``q``: Print the 64-bit register name (e.g. ``rax``), if 64-bit registers are
available, otherwise the 32-bit register name; do nothing on a memory operand.
- ``n``: Negate and print an unadorned integer, or, for operands other than an
immediate integer (e.g. a relocatable symbol expression), print a '-' before
the operand. (The behavior for relocatable symbol expressions is a
target-specific behavior for this typically target-independent modifier)
- ``H``: Print a memory reference with additional offset +8.
- ``P``: Print a memory reference or operand for use as the argument of a call
instruction. (E.g. omit ``(rip)``, even though it's PC-relative.)
XCore:
No additional modifiers.
Inline Asm Metadata
^^^^^^^^^^^^^^^^^^^
A call instruction that wraps an inline asm node may have a
"``!srcloc``" MDNode attached to it that contains a list of constant
integers. If present, the code generator will use the integer as the
location cookie value when reporting errors through the ``LLVMContext``
error reporting mechanisms. This allows a front-end to correlate backend
errors that occur with inline asm back to the source code that produced
it. For example:
.. code-block:: llvm
call void asm sideeffect "something bad", ""(), !srcloc !42
...
!42 = !{ i32 1234567 }
It is up to the front-end to make sense of the magic numbers it places
in the IR. If the MDNode contains multiple constants, the code generator
will use the one that corresponds to the line of the asm that the error
occurs on.
.. _metadata:
Metadata
========
LLVM IR allows metadata to be attached to instructions in the program
that can convey extra information about the code to the optimizers and
code generator. One example application of metadata is source-level
debug information. There are two metadata primitives: strings and nodes.
Metadata does not have a type, and is not a value. If referenced from a
``call`` instruction, it uses the ``metadata`` type.
All metadata are identified in syntax by an exclamation point ('``!``').
.. _metadata-string:
Metadata Nodes and Metadata Strings
-----------------------------------
A metadata string is a string surrounded by double quotes. It can
contain any character by escaping non-printable characters with
"``\xx``" where "``xx``" is the two digit hex code. For example:
"``!"test\00"``".
Metadata nodes are represented with notation similar to structure
constants (a comma separated list of elements, surrounded by braces and
preceded by an exclamation point). Metadata nodes can have any values as
their operands. For example:
.. code-block:: llvm
!{ !"test\00", i32 10}
Metadata nodes that aren't uniqued use the ``distinct`` keyword. For example:
.. code-block:: text
!0 = distinct !{!"test\00", i32 10}
``distinct`` nodes are useful when nodes shouldn't be merged based on their
content. They can also occur when transformations cause uniquing collisions
when metadata operands change.
A :ref:`named metadata <namedmetadatastructure>` is a collection of
metadata nodes, which can be looked up in the module symbol table. For
example:
.. code-block:: llvm
!foo = !{!4, !3}
Metadata can be used as function arguments. Here the ``llvm.dbg.value``
function is using two metadata arguments:
.. code-block:: llvm
call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
Metadata can be attached to an instruction. Here metadata ``!21`` is attached
to the ``add`` instruction using the ``!dbg`` identifier:
.. code-block:: llvm
%indvar.next = add i64 %indvar, 1, !dbg !21
Metadata can also be attached to a function or a global variable. Here metadata
``!22`` is attached to the ``f1`` and ``f2`` functions, and the globals ``g1``
and ``g2`` using the ``!dbg`` identifier:
.. code-block:: llvm
declare !dbg !22 void @f1()
define void @f2() !dbg !22 {
ret void
}
@g1 = global i32 0, !dbg !22
@g2 = external global i32, !dbg !22
A transformation is required to drop any metadata attachment that it does not
know about or knows it can't preserve. Currently there is an exception for metadata
attachment to globals for ``!type`` and ``!absolute_symbol`` which can't be
unconditionally dropped unless the global is itself deleted.
Metadata attached to a module using named metadata may not be dropped, with
the exception of debug metadata (named metadata with the name ``!llvm.dbg.*``).
More information about specific metadata nodes recognized by the
optimizers and code generator is found below.
.. _specialized-metadata:
Specialized Metadata Nodes
^^^^^^^^^^^^^^^^^^^^^^^^^^
Specialized metadata nodes are custom data structures in metadata (as opposed
to generic tuples). Their fields are labelled, and can be specified in any
order.
These aren't inherently debug info centric, but currently all the specialized
metadata nodes are related to debug info.
.. _DICompileUnit:
DICompileUnit
"""""""""""""
``DICompileUnit`` nodes represent a compile unit. The ``enums:``,
``retainedTypes:``, ``globals:``, ``imports:`` and ``macros:`` fields are tuples
containing the debug info to be emitted along with the compile unit, regardless
of code optimizations (some nodes are only emitted if there are references to
them from instructions). The ``debugInfoForProfiling:`` field is a boolean
indicating whether or not line-table discriminators are updated to provide
more-accurate debug info for profiling results.
.. code-block:: text
!0 = !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang",
isOptimized: true, flags: "-O2", runtimeVersion: 2,
splitDebugFilename: "abc.debug", emissionKind: FullDebug,
enums: !2, retainedTypes: !3, globals: !4, imports: !5,
macros: !6, dwoId: 0x0abcd)
Compile unit descriptors provide the root scope for objects declared in a
specific compilation unit. File descriptors are defined using this scope. These
descriptors are collected by a named metadata node ``!llvm.dbg.cu``. They keep
track of global variables, type information, and imported entities (declarations
and namespaces).
.. _DIFile:
DIFile
""""""
``DIFile`` nodes represent files. The ``filename:`` can include slashes.
.. code-block:: none
!0 = !DIFile(filename: "path/to/file", directory: "/path/to/dir",
checksumkind: CSK_MD5,
checksum: "000102030405060708090a0b0c0d0e0f")
Files are sometimes used in ``scope:`` fields, and are the only valid target
for ``file:`` fields.
Valid values for ``checksumkind:`` field are: {CSK_None, CSK_MD5, CSK_SHA1}
.. _DIBasicType:
DIBasicType
"""""""""""
``DIBasicType`` nodes represent primitive types, such as ``int``, ``bool`` and
``float``. ``tag:`` defaults to ``DW_TAG_base_type``.
.. code-block:: text
!0 = !DIBasicType(name: "unsigned char", size: 8, align: 8,
encoding: DW_ATE_unsigned_char)
!1 = !DIBasicType(tag: DW_TAG_unspecified_type, name: "decltype(nullptr)")
The ``encoding:`` describes the details of the type. Usually it's one of the
following:
.. code-block:: text
DW_ATE_address = 1
DW_ATE_boolean = 2
DW_ATE_float = 4
DW_ATE_signed = 5
DW_ATE_signed_char = 6
DW_ATE_unsigned = 7
DW_ATE_unsigned_char = 8
.. _DISubroutineType:
DISubroutineType
""""""""""""""""
``DISubroutineType`` nodes represent subroutine types. Their ``types:`` field
refers to a tuple; the first operand is the return type, while the rest are the
types of the formal arguments in order. If the first operand is ``null``, that
represents a function with no return value (such as ``void foo() {}`` in C++).
.. code-block:: text
!0 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!1 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
!2 = !DISubroutineType(types: !{null, !0, !1}) ; void (int, char)
.. _DIDerivedType:
DIDerivedType
"""""""""""""
``DIDerivedType`` nodes represent types derived from other types, such as
qualified types.
.. code-block:: text
!0 = !DIBasicType(name: "unsigned char", size: 8, align: 8,
encoding: DW_ATE_unsigned_char)
!1 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !0, size: 32,
align: 32)
The following ``tag:`` values are valid:
.. code-block:: text
DW_TAG_member = 13
DW_TAG_pointer_type = 15
DW_TAG_reference_type = 16
DW_TAG_typedef = 22
DW_TAG_inheritance = 28
DW_TAG_ptr_to_member_type = 31
DW_TAG_const_type = 38
DW_TAG_friend = 42
DW_TAG_volatile_type = 53
DW_TAG_restrict_type = 55
DW_TAG_atomic_type = 71
.. _DIDerivedTypeMember:
``DW_TAG_member`` is used to define a member of a :ref:`composite type
<DICompositeType>`. The type of the member is the ``baseType:``. The
``offset:`` is the member's bit offset. If the composite type has an ODR
``identifier:`` and does not set ``flags: DIFlagFwdDecl``, then the member is
uniqued based only on its ``name:`` and ``scope:``.
``DW_TAG_inheritance`` and ``DW_TAG_friend`` are used in the ``elements:``
field of :ref:`composite types <DICompositeType>` to describe parents and
friends.
``DW_TAG_typedef`` is used to provide a name for the ``baseType:``.
``DW_TAG_pointer_type``, ``DW_TAG_reference_type``, ``DW_TAG_const_type``,
``DW_TAG_volatile_type``, ``DW_TAG_restrict_type`` and ``DW_TAG_atomic_type``
are used to qualify the ``baseType:``.
Note that the ``void *`` type is expressed as a type derived from NULL.
.. _DICompositeType:
DICompositeType
"""""""""""""""
``DICompositeType`` nodes represent types composed of other types, like
structures and unions. ``elements:`` points to a tuple of the composed types.
If the source language supports ODR, the ``identifier:`` field gives the unique
identifier used for type merging between modules. When specified,
:ref:`subprogram declarations <DISubprogramDeclaration>` and :ref:`member
derived types <DIDerivedTypeMember>` that reference the ODR-type in their
``scope:`` change uniquing rules.
For a given ``identifier:``, there should only be a single composite type that
does not have ``flags: DIFlagFwdDecl`` set. LLVM tools that link modules
together will unique such definitions at parse time via the ``identifier:``
field, even if the nodes are ``distinct``.
.. code-block:: text
!0 = !DIEnumerator(name: "SixKind", value: 7)
!1 = !DIEnumerator(name: "SevenKind", value: 7)
!2 = !DIEnumerator(name: "NegEightKind", value: -8)
!3 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "Enum", file: !12,
line: 2, size: 32, align: 32, identifier: "_M4Enum",
elements: !{!0, !1, !2})
The following ``tag:`` values are valid:
.. code-block:: text
DW_TAG_array_type = 1
DW_TAG_class_type = 2
DW_TAG_enumeration_type = 4
DW_TAG_structure_type = 19
DW_TAG_union_type = 23
For ``DW_TAG_array_type``, the ``elements:`` should be :ref:`subrange
descriptors <DISubrange>`, each representing the range of subscripts at that
level of indexing. The ``DIFlagVector`` flag to ``flags:`` indicates that an
array type is a native packed vector.
For ``DW_TAG_enumeration_type``, the ``elements:`` should be :ref:`enumerator
descriptors <DIEnumerator>`, each representing the definition of an enumeration
value for the set. All enumeration type descriptors are collected in the
``enums:`` field of the :ref:`compile unit <DICompileUnit>`.
For ``DW_TAG_structure_type``, ``DW_TAG_class_type``, and
``DW_TAG_union_type``, the ``elements:`` should be :ref:`derived types
<DIDerivedType>` with ``tag: DW_TAG_member``, ``tag: DW_TAG_inheritance``, or
``tag: DW_TAG_friend``; or :ref:`subprograms <DISubprogram>` with
``isDefinition: false``.
.. _DISubrange:
DISubrange
""""""""""
``DISubrange`` nodes are the elements for ``DW_TAG_array_type`` variants of
:ref:`DICompositeType`. ``count: -1`` indicates an empty array.
.. code-block:: llvm
!0 = !DISubrange(count: 5, lowerBound: 0) ; array counting from 0
!1 = !DISubrange(count: 5, lowerBound: 1) ; array counting from 1
!2 = !DISubrange(count: -1) ; empty array.
.. _DIEnumerator:
DIEnumerator
""""""""""""
``DIEnumerator`` nodes are the elements for ``DW_TAG_enumeration_type``
variants of :ref:`DICompositeType`.
.. code-block:: llvm
!0 = !DIEnumerator(name: "SixKind", value: 7)
!1 = !DIEnumerator(name: "SevenKind", value: 7)
!2 = !DIEnumerator(name: "NegEightKind", value: -8)
DITemplateTypeParameter
"""""""""""""""""""""""
``DITemplateTypeParameter`` nodes represent type parameters to generic source
language constructs. They are used (optionally) in :ref:`DICompositeType` and
:ref:`DISubprogram` ``templateParams:`` fields.
.. code-block:: llvm
!0 = !DITemplateTypeParameter(name: "Ty", type: !1)
DITemplateValueParameter
""""""""""""""""""""""""
``DITemplateValueParameter`` nodes represent value parameters to generic source
language constructs. ``tag:`` defaults to ``DW_TAG_template_value_parameter``,
but if specified can also be set to ``DW_TAG_GNU_template_template_param`` or
``DW_TAG_GNU_template_param_pack``. They are used (optionally) in
:ref:`DICompositeType` and :ref:`DISubprogram` ``templateParams:`` fields.
.. code-block:: llvm
!0 = !DITemplateValueParameter(name: "Ty", type: !1, value: i32 7)
DINamespace
"""""""""""
``DINamespace`` nodes represent namespaces in the source language.
.. code-block:: llvm
!0 = !DINamespace(name: "myawesomeproject", scope: !1, file: !2, line: 7)
DIGlobalVariable
""""""""""""""""
``DIGlobalVariable`` nodes represent global variables in the source language.
.. code-block:: llvm
!0 = !DIGlobalVariable(name: "foo", linkageName: "foo", scope: !1,
file: !2, line: 7, type: !3, isLocal: true,
isDefinition: false, variable: i32* @foo,
declaration: !4)
All global variables should be referenced by the `globals:` field of a
:ref:`compile unit <DICompileUnit>`.
.. _DISubprogram:
DISubprogram
""""""""""""
``DISubprogram`` nodes represent functions from the source language. A
``DISubprogram`` may be attached to a function definition using ``!dbg``
metadata. The ``variables:`` field points at :ref:`variables <DILocalVariable>`
that must be retained, even if their IR counterparts are optimized out of
the IR. The ``type:`` field must point at an :ref:`DISubroutineType`.
.. _DISubprogramDeclaration:
When ``isDefinition: false``, subprograms describe a declaration in the type
tree as opposed to a definition of a function. If the scope is a composite
type with an ODR ``identifier:`` and that does not set ``flags: DIFlagFwdDecl``,
then the subprogram declaration is uniqued based only on its ``linkageName:``
and ``scope:``.
.. code-block:: text
define void @_Z3foov() !dbg !0 {
...
}
!0 = distinct !DISubprogram(name: "foo", linkageName: "_Zfoov", scope: !1,
file: !2, line: 7, type: !3, isLocal: true,
isDefinition: true, scopeLine: 8,
containingType: !4,
virtuality: DW_VIRTUALITY_pure_virtual,
virtualIndex: 10, flags: DIFlagPrototyped,
isOptimized: true, unit: !5, templateParams: !6,
declaration: !7, variables: !8, thrownTypes: !9)
.. _DILexicalBlock:
DILexicalBlock
""""""""""""""
``DILexicalBlock`` nodes describe nested blocks within a :ref:`subprogram
<DISubprogram>`. The line number and column numbers are used to distinguish
two lexical blocks at the same depth. They are valid targets for ``scope:``
fields.
.. code-block:: text
!0 = distinct !DILexicalBlock(scope: !1, file: !2, line: 7, column: 35)
Usually lexical blocks are ``distinct`` to prevent node merging based on
operands.
.. _DILexicalBlockFile:
DILexicalBlockFile
""""""""""""""""""
``DILexicalBlockFile`` nodes are used to discriminate between sections of a
:ref:`lexical block <DILexicalBlock>`. The ``file:`` field can be changed to
indicate textual inclusion, or the ``discriminator:`` field can be used to
discriminate between control flow within a single block in the source language.
.. code-block:: llvm
!0 = !DILexicalBlock(scope: !3, file: !4, line: 7, column: 35)
!1 = !DILexicalBlockFile(scope: !0, file: !4, discriminator: 0)
!2 = !DILexicalBlockFile(scope: !0, file: !4, discriminator: 1)
.. _DILocation:
DILocation
""""""""""
``DILocation`` nodes represent source debug locations. The ``scope:`` field is
mandatory, and points at an :ref:`DILexicalBlockFile`, an
:ref:`DILexicalBlock`, or an :ref:`DISubprogram`.
.. code-block:: llvm
!0 = !DILocation(line: 2900, column: 42, scope: !1, inlinedAt: !2)
.. _DILocalVariable:
DILocalVariable
"""""""""""""""
``DILocalVariable`` nodes represent local variables in the source language. If
the ``arg:`` field is set to non-zero, then this variable is a subprogram
parameter, and it will be included in the ``variables:`` field of its
:ref:`DISubprogram`.
.. code-block:: text
!0 = !DILocalVariable(name: "this", arg: 1, scope: !3, file: !2, line: 7,
type: !3, flags: DIFlagArtificial)
!1 = !DILocalVariable(name: "x", arg: 2, scope: !4, file: !2, line: 7,
type: !3)
!2 = !DILocalVariable(name: "y", scope: !5, file: !2, line: 7, type: !3)
DIExpression
""""""""""""
``DIExpression`` nodes represent expressions that are inspired by the DWARF
expression language. They are used in :ref:`debug intrinsics<dbg_intrinsics>`
(such as ``llvm.dbg.declare`` and ``llvm.dbg.value``) to describe how the
referenced LLVM variable relates to the source language variable.
The current supported vocabulary is limited:
- ``DW_OP_deref`` dereferences the top of the expression stack.
- ``DW_OP_plus`` pops the last two entries from the expression stack, adds
them together and appends the result to the expression stack.
- ``DW_OP_minus`` pops the last two entries from the expression stack, subtracts
the last entry from the second last entry and appends the result to the
expression stack.
- ``DW_OP_plus_uconst, 93`` adds ``93`` to the working expression.
- ``DW_OP_LLVM_fragment, 16, 8`` specifies the offset and size (``16`` and ``8``
here, respectively) of the variable fragment from the working expression. Note
that, contrary to DW_OP_bit_piece, the offset describes the location
within the described source variable.
- ``DW_OP_swap`` swaps top two stack entries.
- ``DW_OP_xderef`` provides extended dereference mechanism. The entry at the top
of the stack is treated as an address. The second stack entry is treated as an
address space identifier.
- ``DW_OP_stack_value`` marks a constant value.
DWARF specifies three kinds of simple location descriptions: Register, memory,
and implicit location descriptions. Register and memory location descriptions
describe the *location* of a source variable (in the sense that a debugger might
modify its value), whereas implicit locations describe merely the *value* of a
source variable. DIExpressions also follow this model: A DIExpression that
doesn't have a trailing ``DW_OP_stack_value`` will describe an *address* when
combined with a concrete location.
.. code-block:: llvm
!0 = !DIExpression(DW_OP_deref)
!1 = !DIExpression(DW_OP_plus_uconst, 3)
!1 = !DIExpression(DW_OP_constu, 3, DW_OP_plus)
!2 = !DIExpression(DW_OP_bit_piece, 3, 7)
!3 = !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7)
!4 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
!5 = !DIExpression(DW_OP_constu, 42, DW_OP_stack_value)
DIObjCProperty
""""""""""""""
``DIObjCProperty`` nodes represent Objective-C property nodes.
.. code-block:: llvm
!3 = !DIObjCProperty(name: "foo", file: !1, line: 7, setter: "setFoo",
getter: "getFoo", attributes: 7, type: !2)
DIImportedEntity
""""""""""""""""
``DIImportedEntity`` nodes represent entities (such as modules) imported into a
compile unit.
.. code-block:: text
!2 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "foo", scope: !0,
entity: !1, line: 7)
DIMacro
"""""""
``DIMacro`` nodes represent the definition or undefinition of a macro identifier.
The ``name:`` field is the macro identifier, followed by macro parameters when
defining a function-like macro, and the ``value`` field is the token-string
used to expand the macro identifier.
.. code-block:: text
!2 = !DIMacro(macinfo: DW_MACINFO_define, line: 7, name: "foo(x)",
value: "((x) + 1)")
!3 = !DIMacro(macinfo: DW_MACINFO_undef, line: 30, name: "foo")
DIMacroFile
"""""""""""
``DIMacroFile`` nodes represent inclusion of source files.
The ``nodes:`` field is a list of ``DIMacro`` and ``DIMacroFile`` nodes that
appear in the included source file.
.. code-block:: text
!2 = !DIMacroFile(macinfo: DW_MACINFO_start_file, line: 7, file: !2,
nodes: !3)
'``tbaa``' Metadata
^^^^^^^^^^^^^^^^^^^
In LLVM IR, memory does not have types, so LLVM's own type system is not
suitable for doing type based alias analysis (TBAA). Instead, metadata is
added to the IR to describe a type system of a higher level language. This
can be used to implement C/C++ strict type aliasing rules, but it can also
be used to implement custom alias analysis behavior for other languages.
This description of LLVM's TBAA system is broken into two parts:
:ref:`Semantics<tbaa_node_semantics>` talks about high level issues, and
:ref:`Representation<tbaa_node_representation>` talks about the metadata
encoding of various entities.
It is always possible to trace any TBAA node to a "root" TBAA node (details
in the :ref:`Representation<tbaa_node_representation>` section). TBAA
nodes with different roots have an unknown aliasing relationship, and LLVM
conservatively infers ``MayAlias`` between them. The rules mentioned in
this section only pertain to TBAA nodes living under the same root.
.. _tbaa_node_semantics:
Semantics
"""""""""
The TBAA metadata system, referred to as "struct path TBAA" (not to be
confused with ``tbaa.struct``), consists of the following high level
concepts: *Type Descriptors*, further subdivided into scalar type
descriptors and struct type descriptors; and *Access Tags*.
**Type descriptors** describe the type system of the higher level language
being compiled. **Scalar type descriptors** describe types that do not
contain other types. Each scalar type has a parent type, which must also
be a scalar type or the TBAA root. Via this parent relation, scalar types
within a TBAA root form a tree. **Struct type descriptors** denote types
that contain a sequence of other type descriptors, at known offsets. These
contained type descriptors can either be struct type descriptors themselves
or scalar type descriptors.
**Access tags** are metadata nodes attached to load and store instructions.
Access tags use type descriptors to describe the *location* being accessed
in terms of the type system of the higher level language. Access tags are
tuples consisting of a base type, an access type and an offset. The base
type is a scalar type descriptor or a struct type descriptor, the access
type is a scalar type descriptor, and the offset is a constant integer.
The access tag ``(BaseTy, AccessTy, Offset)`` can describe one of two
things:
* If ``BaseTy`` is a struct type, the tag describes a memory access (load
or store) of a value of type ``AccessTy`` contained in the struct type
``BaseTy`` at offset ``Offset``.
* If ``BaseTy`` is a scalar type, ``Offset`` must be 0 and ``BaseTy`` and
``AccessTy`` must be the same; and the access tag describes a scalar
access with scalar type ``AccessTy``.
We first define an ``ImmediateParent`` relation on ``(BaseTy, Offset)``
tuples this way:
* If ``BaseTy`` is a scalar type then ``ImmediateParent(BaseTy, 0)`` is
``(ParentTy, 0)`` where ``ParentTy`` is the parent of the scalar type as
described in the TBAA metadata. ``ImmediateParent(BaseTy, Offset)`` is
undefined if ``Offset`` is non-zero.
* If ``BaseTy`` is a struct type then ``ImmediateParent(BaseTy, Offset)``
is ``(NewTy, NewOffset)`` where ``NewTy`` is the type contained in
``BaseTy`` at offset ``Offset`` and ``NewOffset`` is ``Offset`` adjusted
to be relative within that inner type.
A memory access with an access tag ``(BaseTy1, AccessTy1, Offset1)``
aliases a memory access with an access tag ``(BaseTy2, AccessTy2,
Offset2)`` if either ``(BaseTy1, Offset1)`` is reachable from ``(BaseTy2,
Offset2)`` via repeated application of the ``ImmediateParent`` relation defined
above, or vice versa.
As a concrete example, the type descriptor graph for the following program
.. code-block:: c
struct Inner {
int i; // offset 0
float f; // offset 4
};
struct Outer {
float f; // offset 0
double d; // offset 4
struct Inner inner_a; // offset 12
};
void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) {
outer->f = 0; // tag0: (OuterStructTy, FloatScalarTy, 0)
outer->inner_a.i = 0; // tag1: (OuterStructTy, IntScalarTy, 12)
outer->inner_a.f = 0.0; // tag2: (OuterStructTy, FloatScalarTy, 16)
*f = 0.0; // tag3: (FloatScalarTy, FloatScalarTy, 0)
}
is (note that in C and C++, ``char`` can be used to access any arbitrary
type):
.. code-block:: text
Root = "TBAA Root"
CharScalarTy = ("char", Root, 0)
FloatScalarTy = ("float", CharScalarTy, 0)
DoubleScalarTy = ("double", CharScalarTy, 0)
IntScalarTy = ("int", CharScalarTy, 0)
InnerStructTy = {"Inner", (IntScalarTy, 0), (FloatScalarTy, 4)}
OuterStructTy = {"Outer", (FloatScalarTy, 0), (DoubleScalarTy, 4),
(InnerStructTy, 12)}
with (e.g.) ``ImmediateParent(OuterStructTy, 12)`` = ``(InnerStructTy,
0)``, ``ImmediateParent(InnerStructTy, 0)`` = ``(IntScalarTy, 0)``, and
``ImmediateParent(IntScalarTy, 0)`` = ``(CharScalarTy, 0)``.
.. _tbaa_node_representation:
Representation
""""""""""""""
The root node of a TBAA type hierarchy is an ``MDNode`` with 0 operands or
with exactly one ``MDString`` operand.
Scalar type descriptors are represented as ``MDNode`` s with two
operands. The first operand is an ``MDString`` denoting the name of the
scalar type. LLVM does not assign meaning to the value of this operand; it
only cares about it being an ``MDString``. The second operand is an
``MDNode`` which points to the parent of said scalar type descriptor,
which is either another scalar type descriptor or the TBAA root. Scalar
type descriptors can have an optional third operand, but it must be the
constant integer zero.
Struct type descriptors are represented as ``MDNode`` s with an odd number
of operands greater than 1. The first operand is an ``MDString`` denoting
the name of the struct type. As with scalar type descriptors, the actual
value of this name operand is irrelevant to LLVM. After the name operand,
the struct type descriptors have a sequence of alternating ``MDNode`` and
``ConstantInt`` operands. With N starting from 1, the (2N - 1)-th operand,
an ``MDNode``, denotes a contained field, and the (2N)-th operand, a
``ConstantInt``, is the offset of said contained field. The offsets
must be in non-decreasing order.
Access tags are represented as ``MDNode`` s with either 3 or 4 operands.
The first operand is an ``MDNode`` pointing to the node representing the
base type. The second operand is an ``MDNode`` pointing to the node
representing the access type. The third operand is a ``ConstantInt`` that
states the offset of the access. If a fourth operand is present, it must be
a ``ConstantInt`` valued at 0 or 1. If it is 1 then the access tag states
that the location being accessed is "constant" (meaning
``pointsToConstantMemory`` should return true; see `other useful
AliasAnalysis methods <AliasAnalysis.html#OtherItfs>`_). The TBAA root of
the access type and the base type of an access tag must be the same, and
that is the TBAA root of the access tag.
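For illustration, a subset of the example hierarchy above could be encoded in the node forms just described roughly as follows (the node numbering and the ``!tbaa`` attachment shown here are illustrative, not mandated):
.. code-block:: llvm
; Root and scalar type descriptors
!0 = !{!"TBAA Root"}
!1 = !{!"char", !0, i64 0}              ; CharScalarTy
!2 = !{!"int", !1, i64 0}               ; IntScalarTy
!3 = !{!"float", !1, i64 0}             ; FloatScalarTy
; Struct type descriptor for Inner: int at offset 0, float at offset 4
!4 = !{!"Inner", !2, i64 0, !3, i64 4}
; Access tag for the int member of Inner at offset 0
!5 = !{!4, !2, i64 0}
; Attaching the access tag to a load
%i = load i32, i32* %p, !tbaa !5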
'``tbaa.struct``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^
The :ref:`llvm.memcpy <int_memcpy>` intrinsic is often used to implement
aggregate assignment operations in C and similar languages. However, it
is defined to copy a contiguous region of memory, which is more than
strictly necessary for aggregate types that contain holes due to
padding. Also, it doesn't carry any TBAA information about the fields
of the aggregate.
``!tbaa.struct`` metadata can describe which memory subregions in a
memcpy are padding and what the TBAA tags of the struct are.
The current metadata format is very simple. ``!tbaa.struct`` metadata
nodes are a list of operands which are in conceptual groups of three.
For each group of three, the first operand gives the offset of a
field in bytes, the second gives its size in bytes, and the third gives
its tbaa tag. For example:
.. code-block:: llvm
!4 = !{ i64 0, i64 4, !1, i64 8, i64 4, !2 }
This describes a struct with two fields. The first is at offset 0 bytes
with size 4 bytes, and has tbaa tag !1. The second is at offset 8 bytes
and has size 4 bytes and has tbaa tag !2.
Note that the fields need not be contiguous. In this example, there is a
4 byte gap between the two fields. This gap represents padding which
does not carry useful data and need not be preserved.
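For illustration, such a node might be attached to a ``llvm.memcpy`` call that implements the aggregate copy; the sketch below assumes the five-operand ``i8*`` form of the intrinsic with an explicit alignment operand:
.. code-block:: llvm
; Copy a 12-byte struct; bytes 4-7 are padding and carry no tbaa tag
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 12, i32 4, i1 false), !tbaa.struct !4
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
!4 = !{ i64 0, i64 4, !1, i64 8, i64 4, !2 }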
'``noalias``' and '``alias.scope``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``noalias`` and ``alias.scope`` metadata provide the ability to specify generic
noalias memory-access sets. This means that some collection of memory access
instructions (loads, stores, memory-accessing calls, etc.) that carry
``noalias`` metadata can be explicitly specified not to alias with some other
collection of memory access instructions that carry ``alias.scope`` metadata.
Each type of metadata specifies a list of scopes where each scope has an id and
a domain.
When evaluating an aliasing query, if for some domain, the set
of scopes with that domain in one instruction's ``alias.scope`` list is a
subset of (or equal to) the set of scopes for that domain in another
instruction's ``noalias`` list, then the two memory accesses are assumed not to
alias.
Because scopes in one domain don't affect scopes in other domains, separate
domains can be used to compose multiple independent noalias sets. This is
used for example during inlining. As the noalias function parameters are
turned into noalias scope metadata, a new domain is used every time the
function is inlined.
The metadata identifying each domain is itself a list containing one or two
entries. The first entry is the name of the domain. Note that if the name is a
string then it can be combined across functions and translation units. A
self-reference can be used to create globally unique domain names. A
descriptive string may optionally be provided as a second list entry.
The metadata identifying each scope is also itself a list containing two or
three entries. The first entry is the name of the scope. Note that if the name
is a string then it can be combined across functions and translation units. A
self-reference can be used to create globally unique scope names. A metadata
reference to the scope's domain is the second entry. A descriptive string may
optionally be provided as a third list entry.
For example,
.. code-block:: llvm
; Two scope domains:
!0 = !{!0}
!1 = !{!1}
; Some scopes in these domains:
!2 = !{!2, !0}
!3 = !{!3, !0}
!4 = !{!4, !1}
; Some scope lists:
!5 = !{!4} ; A list containing only scope !4
!6 = !{!4, !3, !2}
!7 = !{!3}
; These two instructions don't alias:
%0 = load float, float* %c, align 4, !alias.scope !5
store float %0, float* %arrayidx.i, align 4, !noalias !5
; These two instructions also don't alias (for domain !1, the set of scopes
; in the !alias.scope equals that in the !noalias list):
%2 = load float, float* %c, align 4, !alias.scope !5
store float %2, float* %arrayidx.i2, align 4, !noalias !6
; These two instructions may alias (for domain !0, the set of scopes in
; the !noalias list is not a superset of, or equal to, the scopes in the
; !alias.scope list):
%2 = load float, float* %c, align 4, !alias.scope !6
store float %0, float* %arrayidx.i, align 4, !noalias !7
'``fpmath``' Metadata
^^^^^^^^^^^^^^^^^^^^^
``fpmath`` metadata may be attached to any instruction of floating point
type. It can be used to express the maximum acceptable error in the
result of that instruction, in ULPs, thus potentially allowing the
compiler to use a more efficient but less accurate method of computing
it. ULP is defined as follows:
If ``x`` is a real number that lies between two finite consecutive
floating-point numbers ``a`` and ``b``, without being equal to one
of them, then ``ulp(x) = |b - a|``, otherwise ``ulp(x)`` is the
distance between the two non-equal finite floating-point numbers
nearest ``x``. Moreover, ``ulp(NaN)`` is ``NaN``.
The metadata node shall consist of a single positive float type number
representing the maximum acceptable error in ULPs, for example:
.. code-block:: llvm
!0 = !{ float 2.5 } ; maximum acceptable inaccuracy is 2.5 ULPs
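Such a node might then be attached to a floating point operation, for example (a sketch):
.. code-block:: llvm
%d = fdiv float %x, %y, !fpmath !0 ; result may be off by up to 2.5 ULPs
!0 = !{ float 2.5 }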
.. _range-metadata:
'``range``' Metadata
^^^^^^^^^^^^^^^^^^^^
``range`` metadata may be attached only to ``load``, ``call`` and ``invoke`` of
integer types. It expresses the possible ranges the loaded value or the value
returned by the called function at this call site is in. The ranges are
represented with a flattened list of integers. The loaded value or the value
returned is known to be in the union of the ranges defined by each consecutive
pair. Each pair has the following properties:
- The type must match the type loaded by the instruction.
- The pair ``a,b`` represents the range ``[a,b)``.
- Both ``a`` and ``b`` are constants.
- The range is allowed to wrap.
- The range should not represent the full or empty set. That is,
``a!=b``.
In addition, the pairs must be in signed order of the lower bound and
they must be non-contiguous.
Examples:
.. code-block:: llvm
%a = load i8, i8* %x, align 1, !range !0 ; Can only be 0 or 1
%b = load i8, i8* %y, align 1, !range !1 ; Can only be 255 (-1), 0 or 1
%c = call i8 @foo(), !range !2 ; Can only be 0, 1, 3, 4 or 5
%d = invoke i8 @bar() to label %cont
unwind label %lpad, !range !3 ; Can only be -2, -1, 3, 4 or 5
...
!0 = !{ i8 0, i8 2 }
!1 = !{ i8 255, i8 2 }
!2 = !{ i8 0, i8 2, i8 3, i8 6 }
!3 = !{ i8 -2, i8 0, i8 3, i8 6 }
'``absolute_symbol``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``absolute_symbol`` metadata may be attached to a global variable
declaration. It marks the declaration as a reference to an absolute symbol,
which causes the backend to use absolute relocations for the symbol even
in position independent code, and expresses the possible ranges that the
global variable's *address* (not its value) is in, in the same format as
``range`` metadata, with the extension that the pair ``all-ones,all-ones``
may be used to represent the full set.
Example (assuming 64-bit pointers):
.. code-block:: llvm
@a = external global i8, !absolute_symbol !0 ; Absolute symbol in range [0,256)
@b = external global i8, !absolute_symbol !1 ; Absolute symbol in range [0,2^64)
...
!0 = !{ i64 0, i64 256 }
!1 = !{ i64 -1, i64 -1 }
'``unpredictable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``unpredictable`` metadata may be attached to any branch or switch
instruction. It can be used to express the unpredictability of control
flow. Similar to the ``llvm.expect`` intrinsic, it may be used to alter
optimizations related to compare and branch instructions. The metadata
is treated as a boolean value; if it exists, it signals that the branch
or switch that it is attached to is completely unpredictable.
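Because only the presence of the metadata matters, an empty metadata node is sufficient; a minimal sketch:
.. code-block:: llvm
br i1 %cond, label %if.then, label %if.else, !unpredictable !0
...
!0 = !{}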
'``llvm.loop``'
^^^^^^^^^^^^^^^
It is sometimes useful to attach information to loop constructs. Currently,
loop metadata is implemented as metadata attached to the branch instruction
in the loop latch block. This type of metadata refers to a metadata node that is
guaranteed to be separate for each loop. The loop identifier metadata is
specified with the name ``llvm.loop``.
The loop identifier metadata is implemented using a metadata that refers to
itself to avoid merging it with any other identifier metadata, e.g.,
during module linkage or function inlining. That is, each loop should refer
to its own identification metadata even if the loops reside in separate functions.
The following example contains loop identifier metadata for two separate loop
constructs:
.. code-block:: llvm
!0 = !{!0}
!1 = !{!1}
The loop identifier metadata can be used to specify additional
per-loop metadata. Any operands after the first operand can be treated
as user-defined metadata. For example, the ``llvm.loop.unroll.count``
metadata suggests an unroll factor to the loop unroller:
.. code-block:: llvm
br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
...
!0 = !{!0, !1}
!1 = !{!"llvm.loop.unroll.count", i32 4}
'``llvm.loop.vectorize``' and '``llvm.loop.interleave``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Metadata prefixed with ``llvm.loop.vectorize`` or ``llvm.loop.interleave`` are
used to control per-loop vectorization and interleaving parameters such as
vectorization width and interleave count. These metadata should be used in
conjunction with ``llvm.loop`` loop identification metadata. The
``llvm.loop.vectorize`` and ``llvm.loop.interleave`` metadata are only
optimization hints and the optimizer will only interleave and vectorize loops if
it believes it is safe to do so. The ``llvm.mem.parallel_loop_access`` metadata,
which contains information about loop-carried memory dependencies, can be helpful
in determining the safety of these transformations.
'``llvm.loop.interleave.count``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata suggests an interleave count to the loop interleaver.
The first operand is the string ``llvm.loop.interleave.count`` and the
second operand is an integer specifying the interleave count. For
example:
.. code-block:: llvm
!0 = !{!"llvm.loop.interleave.count", i32 4}
Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving
multiple iterations of the loop. If ``llvm.loop.interleave.count`` is set to 0
then the interleave count will be determined automatically.
'``llvm.loop.vectorize.enable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata selectively enables or disables vectorization for the loop. The
first operand is the string ``llvm.loop.vectorize.enable`` and the second operand
is a bit. If the bit operand value is 1, vectorization is enabled. A value of
0 disables vectorization:
.. code-block:: llvm
!0 = !{!"llvm.loop.vectorize.enable", i1 0}
!1 = !{!"llvm.loop.vectorize.enable", i1 1}
'``llvm.loop.vectorize.width``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata sets the target width of the vectorizer. The first
operand is the string ``llvm.loop.vectorize.width`` and the second
operand is an integer specifying the width. For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.vectorize.width", i32 4}
Note that setting ``llvm.loop.vectorize.width`` to 1 disables
vectorization of the loop. If ``llvm.loop.vectorize.width`` is set to
0 or if the loop does not have this metadata the width will be
determined automatically.
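As with the unrolling example under '``llvm.loop``' above, these hints are attached to a loop through its identifier metadata; a sketch combining a vectorization width and an interleave count:
.. code-block:: llvm
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
...
!0 = !{!0, !1, !2}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.interleave.count", i32 2}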
'``llvm.loop.unroll``'
^^^^^^^^^^^^^^^^^^^^^^
Metadata prefixed with ``llvm.loop.unroll`` are loop unrolling
optimization hints such as the unroll factor. ``llvm.loop.unroll``
metadata should be used in conjunction with ``llvm.loop`` loop
identification metadata. The ``llvm.loop.unroll`` metadata are only
optimization hints and the unrolling will only be performed if the
optimizer believes it is safe to do so.
'``llvm.loop.unroll.count``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata suggests an unroll factor to the loop unroller. The
first operand is the string ``llvm.loop.unroll.count`` and the second
operand is a positive integer specifying the unroll factor. For
example:
.. code-block:: llvm
!0 = !{!"llvm.loop.unroll.count", i32 4}
If the trip count of the loop is less than the unroll count the loop
will be partially unrolled.
'``llvm.loop.unroll.disable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata disables loop unrolling. The metadata has a single operand
which is the string ``llvm.loop.unroll.disable``. For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.unroll.disable"}
'``llvm.loop.unroll.runtime.disable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata disables runtime loop unrolling. The metadata has a single
operand which is the string ``llvm.loop.unroll.runtime.disable``. For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.unroll.runtime.disable"}
'``llvm.loop.unroll.enable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata suggests that the loop should be fully unrolled if the trip count
is known at compile time and partially unrolled if the trip count is not known
at compile time. The metadata has a single operand which is the string
``llvm.loop.unroll.enable``. For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.unroll.enable"}
'``llvm.loop.unroll.full``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata suggests that the loop should be unrolled fully. The
metadata has a single operand which is the string ``llvm.loop.unroll.full``.
For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.unroll.full"}
'``llvm.loop.licm_versioning.disable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This metadata indicates that the loop should not be versioned for the purpose
of enabling loop-invariant code motion (LICM). The metadata has a single operand
which is the string ``llvm.loop.licm_versioning.disable``. For example:
.. code-block:: llvm
!0 = !{!"llvm.loop.licm_versioning.disable"}
'``llvm.loop.distribute.enable``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Loop distribution allows splitting a loop into multiple loops. Currently,
this is only performed if the entire loop cannot be vectorized due to unsafe
memory dependencies. The transformation will attempt to isolate the unsafe
dependencies into their own loop.
This metadata can be used to selectively enable or disable distribution of the
loop. The first operand is the string ``llvm.loop.distribute.enable`` and the
second operand is a bit. If the bit operand value is 1, distribution is
enabled. A value of 0 disables distribution:
.. code-block:: llvm
!0 = !{!"llvm.loop.distribute.enable", i1 0}
!1 = !{!"llvm.loop.distribute.enable", i1 1}
This metadata should be used in conjunction with ``llvm.loop`` loop
identification metadata.
'``llvm.mem``'
^^^^^^^^^^^^^^^
Metadata types used to annotate memory accesses with information helpful
for optimizations are prefixed with ``llvm.mem``.
'``llvm.mem.parallel_loop_access``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``llvm.mem.parallel_loop_access`` metadata refers to a loop identifier,
or metadata containing a list of loop identifiers for nested loops.
The metadata is attached to memory accessing instructions and denotes that
no loop-carried memory dependence exists between it and other instructions denoted
with the same loop identifier. The metadata on memory reads also implies that
if-conversion (i.e. speculative execution within a loop iteration) is safe.
Precisely, given two instructions ``m1`` and ``m2`` that both have the
``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the
sets of loops associated with that metadata, respectively, then there is no
loop-carried dependence between ``m1`` and ``m2`` for loops in both ``L1``
and ``L2``.
As a special case, if all memory accessing instructions in a loop have
``llvm.mem.parallel_loop_access`` metadata that refers to that loop, then the
loop has no loop-carried memory dependences and is considered to be a parallel
loop.
Note that if not all memory access instructions have such metadata referring to
the loop, then the loop is not considered trivially parallel. Additional
memory dependence analysis is required to make that determination. As a fail-safe
mechanism, this causes loops that were originally parallel to be considered
sequential (if optimization passes that are unaware of the parallel semantics
insert new memory instructions into the loop body).
The following is an example of a loop that is considered parallel due to its
correct use of both ``llvm.loop`` and ``llvm.mem.parallel_loop_access``
metadata types that refer to the same loop identifier metadata:
.. code-block:: llvm
for.body:
...
%val0 = load i32, i32* %arrayidx, !llvm.mem.parallel_loop_access !0
...
store i32 %val0, i32* %arrayidx1, !llvm.mem.parallel_loop_access !0
...
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end:
...
!0 = !{!0}
It is also possible to have nested parallel loops. In that case the
memory accesses refer to a list of loop identifier metadata nodes instead of
the loop identifier metadata node directly:
.. code-block:: llvm
outer.for.body:
...
%val1 = load i32, i32* %arrayidx3, !llvm.mem.parallel_loop_access !2
...
br label %inner.for.body
inner.for.body:
...
%val0 = load i32, i32* %arrayidx1, !llvm.mem.parallel_loop_access !0
...
store i32 %val0, i32* %arrayidx2, !llvm.mem.parallel_loop_access !0
...
br i1 %exitcond, label %inner.for.end, label %inner.for.body, !llvm.loop !1
inner.for.end:
...
store i32 %val1, i32* %arrayidx4, !llvm.mem.parallel_loop_access !2
...
br i1 %exitcond, label %outer.for.end, label %outer.for.body, !llvm.loop !2
outer.for.end: ; preds = %for.body
...
!0 = !{!1, !2} ; a list of loop identifiers
!1 = !{!1} ; an identifier for the inner loop
!2 = !{!2} ; an identifier for the outer loop
'``invariant.group``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``invariant.group`` metadata may be attached to ``load``/``store`` instructions.
The existence of the ``invariant.group`` metadata on the instruction tells
the optimizer that every ``load`` and ``store`` to the same pointer operand
within the same invariant group can be assumed to load or store the same
value (but see the ``llvm.invariant.group.barrier`` intrinsic which affects
when two pointers are considered the same). Pointers returned by bitcast or
getelementptr with only zero indices are considered the same.
Examples:
.. code-block:: llvm
@unknownPtr = external global i8
...
%ptr = alloca i8
store i8 42, i8* %ptr, !invariant.group !0
call void @foo(i8* %ptr)
%a = load i8, i8* %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
call void @foo(i8* %ptr)
%b = load i8, i8* %ptr, !invariant.group !1 ; Can't assume anything, because group changed
%newPtr = call i8* @getPointer(i8* %ptr)
%c = load i8, i8* %newPtr, !invariant.group !0 ; Can't assume anything, because we only have information about %ptr
%unknownValue = load i8, i8* @unknownPtr
store i8 %unknownValue, i8* %ptr, !invariant.group !0 ; Can assume that %unknownValue == 42
call void @foo(i8* %ptr)
%newPtr2 = call i8* @llvm.invariant.group.barrier(i8* %ptr)
%d = load i8, i8* %newPtr2, !invariant.group !0 ; Can't step through invariant.group.barrier to get value of %ptr
...
declare void @foo(i8*)
declare i8* @getPointer(i8*)
declare i8* @llvm.invariant.group.barrier(i8*)
!0 = !{!"magic ptr"}
!1 = !{!"other ptr"}
The invariant.group metadata must be dropped when replacing one pointer by
another based on aliasing information. This is because invariant.group is tied
to the SSA value of the pointer operand.
.. code-block:: llvm
%v = load i8, i8* %x, !invariant.group !0
; if %x mustalias %y then we can replace the above instruction with
%v = load i8, i8* %y
'``type``' Metadata
^^^^^^^^^^^^^^^^^^^
See :doc:`TypeMetadata`.
'``associated``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^
The ``associated`` metadata may be attached to a global object
declaration with a single argument that references another global object.
This metadata prevents discarding of the global object in linker GC
unless the referenced object is also discarded. The linker support for
this feature is spotty. For best compatibility, globals carrying this
metadata may also:
- Be in a comdat with the referenced global.
- Be in @llvm.compiler.used.
- Have an explicit section with a name which is a valid C identifier.
It does not have any effect on non-ELF targets.
Example:
.. code-block:: llvm
$a = comdat any
@a = global i32 1, comdat $a
@b = internal global i32 2, comdat $a, section "abc", !associated !0
!0 = !{i32* @a}
'``prof``' Metadata
^^^^^^^^^^^^^^^^^^^
The ``prof`` metadata is used to record profile data in the IR.
The first operand of the metadata node indicates the profile metadata
type. There are currently 3 types:
:ref:`branch_weights<prof_node_branch_weights>`,
:ref:`function_entry_count<prof_node_function_entry_count>`, and
:ref:`VP<prof_node_VP>`.
.. _prof_node_branch_weights:
branch_weights
""""""""""""""
Branch weight metadata attached to a branch, select, switch or call instruction
represents the likelihood of the associated branch being taken.
For more information, see :doc:`BranchWeightMetadata`.
.. _prof_node_function_entry_count:
function_entry_count
""""""""""""""""""""
Function entry count metadata can be attached to function definitions
to record the number of times the function is called. Together with block
frequency information (BFI), it is also used to derive the basic block
profile count.
For more information, see :doc:`BranchWeightMetadata`.
.. _prof_node_VP:
VP
""
VP (value profile) metadata can be attached to instructions that have
value profile information. Currently this is indirect calls (where it
records the hottest callees) and calls to memory intrinsics such as memcpy,
memmove, and memset (where it records the hottest byte lengths).
Each VP metadata node contains a "VP" string, then a uint32_t value for the value
profiling kind, a uint64_t value for the total number of times the instruction
is executed, followed by pairs of uint64_t values and execution counts.
The value profiling kind is 0 for indirect call targets and 1 for memory
operations. For indirect call targets, each profile value is a hash
of the callee function name, and for memory operations each value is the
byte length.
Note that the value counts do not need to add up to the total count
listed in the third operand (in practice only the top hottest values
are tracked and reported).
Indirect call example:
.. code-block:: llvm
call void %f(), !prof !1
!1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410}
Note that the VP type is 0 (the second operand), which indicates that this is
indirect call value profile data. The third operand indicates that the
indirect call was executed 1600 times. The 4th and 6th operands give the
hashes of the 2 hottest target functions' names (this is the same hash used
to represent function names in the profile database), and the 5th and 7th
operands give the number of times each of the respective target
functions was called.
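A memory intrinsic example might look like the following sketch, where the kind operand is 1 and the profiled values are byte lengths (the specific lengths and counts are illustrative; the five-operand ``i8*`` form of ``llvm.memcpy`` is assumed):
.. code-block:: llvm
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false), !prof !2
!2 = !{!"VP", i32 1, i64 12000, i64 128, i64 9000, i64 4096, i64 3000}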
Module Flags Metadata
=====================
Information about the module as a whole is difficult to convey to LLVM's
subsystems. The LLVM IR isn't sufficient to transmit this information.
The ``llvm.module.flags`` named metadata exists in order to facilitate
this. These flags are in the form of key / value pairs --- much like a
dictionary --- making it easy for any subsystem that cares about a flag to
look it up.
The ``llvm.module.flags`` metadata contains a list of metadata triplets.
Each triplet has the following form:
- The first element is a *behavior* flag, which specifies the behavior
when two (or more) modules are merged together and two (or more) metadata
entries with the same ID are encountered. The supported behaviors are
described below.
- The second element is a metadata string that is a unique ID for the
metadata. Each module may only have one flag entry for each unique ID (not
including entries with the **Require** behavior).
- The third element is the value of the flag.
When two (or more) modules are merged together, the resulting
``llvm.module.flags`` metadata is the union of the modules' flags. That is, for
each unique metadata ID string, there will be exactly one entry in the merged
modules' ``llvm.module.flags`` metadata table, and the value for that entry will
be determined by the merge behavior flag, as described below. The only exception
is that entries with the *Require* behavior are always preserved.
The following behaviors are supported:
.. list-table::
:header-rows: 1
:widths: 10 90
* - Value
- Behavior
* - 1
- **Error**
Emits an error if two values disagree, otherwise the resulting value
is that of the operands.
* - 2
- **Warning**
Emits a warning if two values disagree. The result value will be the
operand for the flag from the first module being linked.
* - 3
- **Require**
Adds a requirement that another module flag be present and have a
specified value after linking is performed. The value must be a
metadata pair, where the first element of the pair is the ID of the
module flag to be restricted, and the second element of the pair is
the value the module flag should be restricted to. This behavior can
be used to restrict the allowable results (via triggering of an
error) of linking IDs with the **Override** behavior.
* - 4
- **Override**
Uses the specified value, regardless of the behavior or value of the
other module. If both modules specify **Override**, but the values
differ, an error will be emitted.
* - 5
- **Append**
Appends the two values, which are required to be metadata nodes.
* - 6
- **AppendUnique**
Appends the two values, which are required to be metadata
nodes. However, duplicate entries in the second list are dropped
during the append operation.
* - 7
- **Max**
Takes the max of the two values, which are required to be integers.
It is an error for a particular unique flag ID to have multiple behaviors,
except in the case of **Require** (which adds restrictions on another metadata
value) or **Override**.
An example of module flags:
.. code-block:: llvm
!0 = !{ i32 1, !"foo", i32 1 }
!1 = !{ i32 4, !"bar", i32 37 }
!2 = !{ i32 2, !"qux", i32 42 }
!3 = !{ i32 3, !"qux",
!{
!"foo", i32 1
}
}
!llvm.module.flags = !{ !0, !1, !2, !3 }
- Metadata ``!0`` has the ID ``!"foo"`` and the value '1'. The behavior
if two or more ``!"foo"`` flags are seen is to emit an error if their
values are not equal.
- Metadata ``!1`` has the ID ``!"bar"`` and the value '37'. The
behavior if two or more ``!"bar"`` flags are seen is to use the value
'37'.
- Metadata ``!2`` has the ID ``!"qux"`` and the value '42'. The
behavior if two or more ``!"qux"`` flags are seen is to emit a
warning if their values are not equal.
- Metadata ``!3`` has the ID ``!"qux"`` and the value:
::
!{ !"foo", i32 1 }
The behavior is to emit an error if the ``llvm.module.flags`` does not
contain a flag with the ID ``!"foo"`` that has the value '1' after linking is
performed.
Objective-C Garbage Collection Module Flags Metadata
----------------------------------------------------
On the Mach-O platform, Objective-C stores metadata about garbage
collection in a special section called "image info". The metadata
consists of a version number and a bitmask specifying what types of
garbage collection are supported (if any) by the file. If two or more
modules are linked together their garbage collection metadata needs to
be merged rather than appended together.
The Objective-C garbage collection module flags metadata consists of the
following key-value pairs:
.. list-table::
:header-rows: 1
:widths: 30 70
* - Key
- Value
* - ``Objective-C Version``
- **[Required]** --- The Objective-C ABI version. Valid values are 1 and 2.
* - ``Objective-C Image Info Version``
- **[Required]** --- The version of the image info section. Currently
always 0.
* - ``Objective-C Image Info Section``
- **[Required]** --- The section to place the metadata. Valid values are
``"__OBJC, __image_info, regular"`` for Objective-C ABI version 1, and
``"__DATA,__objc_imageinfo, regular, no_dead_strip"`` for
Objective-C ABI version 2.
* - ``Objective-C Garbage Collection``
- **[Required]** --- Specifies whether garbage collection is supported or
not. Valid values are 0, for no garbage collection, and 2, for garbage
collection supported.
* - ``Objective-C GC Only``
- **[Optional]** --- Specifies that only garbage collection is supported.
If present, its value must be 6. This flag requires that the
``Objective-C Garbage Collection`` flag have the value 2.
Some important flag interactions:
- If a module with ``Objective-C Garbage Collection`` set to 0 is
merged with a module with ``Objective-C Garbage Collection`` set to
2, then the resulting module has the
``Objective-C Garbage Collection`` flag set to 0.
- A module with ``Objective-C Garbage Collection`` set to 0 cannot be
merged with a module with ``Objective-C GC Only`` set to 6.
C type width Module Flags Metadata
----------------------------------
The ARM backend emits a section into each generated object file describing the
options that it was compiled with (in a compiler-independent way) to prevent
linking incompatible objects, and to allow automatic library selection. Some
of these options are not visible at the IR level, namely wchar_t width and enum
width.
To pass this information to the backend, these options are encoded in module
flags metadata, using the following key-value pairs:
.. list-table::
:header-rows: 1
:widths: 30 70
* - Key
- Value
* - short_wchar
- * 0 --- sizeof(wchar_t) == 4
* 1 --- sizeof(wchar_t) == 2
* - short_enum
- * 0 --- Enums are at least as large as an ``int``.
* 1 --- Enums are stored in the smallest integer type which can
represent all of its values.
For example, the following metadata section specifies that the module was
compiled with a ``wchar_t`` width of 4 bytes, and the underlying type of an
enum is the smallest type which can represent all of its values::
!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"short_wchar", i32 1}
!1 = !{i32 1, !"short_enum", i32 0}
Automatic Linker Flags Named Metadata
=====================================
Some targets support embedding flags for the linker inside individual object
files. Typically this is used in conjunction with language extensions which
allow source files to explicitly declare the libraries they depend on, and have
these automatically be transmitted to the linker via object files.
These flags are encoded in the IR using named metadata with the name
``!llvm.linker.options``. Each operand is expected to be a metadata node
which should be a list of other metadata nodes, each of which should be a
list of metadata strings defining linker options.
For example, the following metadata section specifies two separate sets of
linker options, presumably to link against ``libz`` and the ``Cocoa``
framework::
!0 = !{ !"-lz" },
!1 = !{ !"-framework", !"Cocoa" } } }
!llvm.linker.options = !{ !0, !1 }
The metadata encoding as lists of lists of options, as opposed to a collapsed
list of options, is chosen so that the IR encoding can use multiple option
strings to specify e.g., a single library, while still having that specifier be
preserved as an atomic element that can be recognized by a target specific
assembly writer or object file emitter.
Each individual option is required to be either a valid option for the target's
linker, or an option that is reserved by the target specific assembly writer or
object file emitter. No other aspect of these options is defined by the IR.
.. _intrinsicglobalvariables:
Intrinsic Global Variables
==========================
LLVM has a number of "magic" global variables that contain data that
affect code generation or other IR semantics. These are documented here.
All globals of this sort should have a section specified as
"``llvm.metadata``". This section and all globals that start with
"``llvm.``" are reserved for use by LLVM.
.. _gv_llvmused:
The '``llvm.used``' Global Variable
-----------------------------------
The ``@llvm.used`` global is an array which has
:ref:`appending linkage <linkage_appending>`. This array contains a list of
pointers to named global variables, functions and aliases which may optionally
have a pointer cast formed of bitcast or getelementptr. For example, a legal
use of it is:
.. code-block:: llvm
@X = global i8 4
@Y = global i32 123
@llvm.used = appending global [2 x i8*] [
i8* @X,
i8* bitcast (i32* @Y to i8*)
], section "llvm.metadata"
If a symbol appears in the ``@llvm.used`` list, then the compiler, assembler,
and linker are required to treat the symbol as if there is a reference to the
symbol that it cannot see (which is why they have to be named). For example, if
a variable has internal linkage and no references other than that from the
``@llvm.used`` list, it cannot be deleted. This is commonly used to represent
references from inline asms and other things the compiler cannot "see", and
corresponds to "``__attribute__((used))``" in GNU C.
On some targets, the code generator must emit a directive to the
assembler or object file to prevent the assembler and linker from
molesting the symbol.
.. _gv_llvmcompilerused:
The '``llvm.compiler.used``' Global Variable
--------------------------------------------
The ``@llvm.compiler.used`` directive is the same as the ``@llvm.used``
directive, except that it only prevents the compiler from touching the
symbol. On targets that support it, this allows an intelligent linker to
optimize references to the symbol without being impeded as it would be
by ``@llvm.used``.
This is a rare construct that should only be used in rare circumstances,
and should not be exposed to source languages.
.. _gv_llvmglobalctors:
The '``llvm.global_ctors``' Global Variable
-------------------------------------------
.. code-block:: llvm
%0 = type { i32, void ()*, i8* }
@llvm.global_ctors = appending global [1 x %0] [%0 { i32 65535, void ()* @ctor, i8* @data }]
The ``@llvm.global_ctors`` array contains a list of constructor
functions, priorities, and an optional associated global or function.
The functions referenced by this array will be called in ascending order
of priority (i.e. lowest first) when the module is loaded. The order of
functions with the same priority is not defined.
If the third field is present, non-null, and points to a global variable
or function, the initializer function will only run if the associated
data from the current module is not discarded.
.. _llvmglobaldtors:
The '``llvm.global_dtors``' Global Variable
-------------------------------------------
.. code-block:: llvm
%0 = type { i32, void ()*, i8* }
@llvm.global_dtors = appending global [1 x %0] [%0 { i32 65535, void ()* @dtor, i8* @data }]
The ``@llvm.global_dtors`` array contains a list of destructor
functions, priorities, and an optional associated global or function.
The functions referenced by this array will be called in descending
order of priority (i.e. highest first) when the module is unloaded. The
order of functions with the same priority is not defined.
If the third field is present, non-null, and points to a global variable
or function, the destructor function will only run if the associated
data from the current module is not discarded.
Instruction Reference
=====================
The LLVM instruction set consists of several different classifications
of instructions: :ref:`terminator instructions <terminators>`, :ref:`binary
instructions <binaryops>`, :ref:`bitwise binary
instructions <bitwiseops>`, :ref:`memory instructions <memoryops>`, and
:ref:`other instructions <otherops>`.
.. _terminators:
Terminator Instructions
-----------------------
As mentioned :ref:`previously <functionstructure>`, every basic block in a
program ends with a "Terminator" instruction, which indicates which
block should be executed after the current block is finished. These
terminator instructions typically yield a '``void``' value: they produce
control flow, not values (the one exception being the
':ref:`invoke <i_invoke>`' instruction).
The terminator instructions are: ':ref:`ret <i_ret>`',
':ref:`br <i_br>`', ':ref:`switch <i_switch>`',
':ref:`indirectbr <i_indirectbr>`', ':ref:`invoke <i_invoke>`',
':ref:`resume <i_resume>`', ':ref:`catchswitch <i_catchswitch>`',
':ref:`catchret <i_catchret>`',
':ref:`cleanupret <i_cleanupret>`',
and ':ref:`unreachable <i_unreachable>`'.
.. _i_ret:
'``ret``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
ret <type> <value> ; Return a value from a non-void function
ret void ; Return from void function
Overview:
"""""""""
The '``ret``' instruction is used to return control flow (and optionally
a value) from a function back to the caller.
There are two forms of the '``ret``' instruction: one that returns a
value and then causes control flow, and one that just causes control
flow to occur.
Arguments:
""""""""""
The '``ret``' instruction optionally accepts a single argument, the
return value. The type of the return value must be a ':ref:`first
class <t_firstclass>`' type.
A function is not :ref:`well formed <wellformed>` if it has a non-void
return type and contains a '``ret``' instruction with no return value or
a return value with a type that does not match its type, or if it has a
void return type and contains a '``ret``' instruction with a return
value.
Semantics:
""""""""""
When the '``ret``' instruction is executed, control flow returns back to
the calling function's context. If the caller is a
":ref:`call <i_call>`" instruction, execution continues at the
instruction after the call. If the caller is an
":ref:`invoke <i_invoke>`" instruction, execution continues at the
beginning of the "normal" destination block. If the instruction returns
a value, that value shall set the call or invoke instruction's return
value.
Example:
""""""""
.. code-block:: llvm
ret i32 5 ; Return an integer value of 5
ret void ; Return from a void function
ret { i32, i8 } { i32 4, i8 2 } ; Return a struct of values 4 and 2
.. _i_br:
'``br``' Instruction
^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
br i1 <cond>, label <iftrue>, label <iffalse>
br label <dest> ; Unconditional branch
Overview:
"""""""""
The '``br``' instruction is used to cause control flow to transfer to a
different basic block in the current function. There are two forms of
this instruction, corresponding to a conditional branch and an
unconditional branch.
Arguments:
""""""""""
The conditional branch form of the '``br``' instruction takes a single
'``i1``' value and two '``label``' values. The unconditional form of the
'``br``' instruction takes a single '``label``' value as a target.
Semantics:
""""""""""
Upon execution of a conditional '``br``' instruction, the '``i1``'
argument is evaluated. If the value is ``true``, control flows to the
'``iftrue``' ``label`` argument. If the value is ``false``, control flows
to the '``iffalse``' ``label`` argument.
Example:
""""""""
.. code-block:: llvm
Test:
%cond = icmp eq i32 %a, %b
br i1 %cond, label %IfEqual, label %IfUnequal
IfEqual:
ret i32 1
IfUnequal:
ret i32 0
.. _i_switch:
'``switch``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
switch <intty> <value>, label <defaultdest> [ <intty> <val>, label <dest> ... ]
Overview:
"""""""""
The '``switch``' instruction is used to transfer control flow to one of
several different places. It is a generalization of the '``br``'
instruction, allowing a branch to occur to one of many possible
destinations.
Arguments:
""""""""""
The '``switch``' instruction uses three parameters: an integer
comparison value '``value``', a default '``label``' destination, and an
array of pairs of comparison value constants and '``label``'s. The table
is not allowed to contain duplicate constant entries.
Semantics:
""""""""""
The ``switch`` instruction specifies a table of values and destinations.
When the '``switch``' instruction is executed, this table is searched
for the given value. If the value is found, control flow is transferred
to the corresponding destination; otherwise, control flow is transferred
to the default destination.
Implementation:
"""""""""""""""
Depending on properties of the target machine and the particular
``switch`` instruction, this instruction may be code generated in
different ways. For example, it could be generated as a series of
chained conditional branches or with a lookup table.
Example:
""""""""
.. code-block:: llvm
; Emulate a conditional br instruction
%Val = zext i1 %value to i32
switch i32 %Val, label %truedest [ i32 0, label %falsedest ]
; Emulate an unconditional br instruction
switch i32 0, label %dest [ ]
; Implement a jump table:
switch i32 %val, label %otherwise [ i32 0, label %onzero
i32 1, label %onone
i32 2, label %ontwo ]
.. _i_indirectbr:
'``indirectbr``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
indirectbr <somety>* <address>, [ label <dest1>, label <dest2>, ... ]
Overview:
"""""""""
The '``indirectbr``' instruction implements an indirect branch to a
label within the current function, whose address is specified by
"``address``". Address must be derived from a
:ref:`blockaddress <blockaddress>` constant.
Arguments:
""""""""""
The '``address``' argument is the address of the label to jump to. The
rest of the arguments indicate the full set of possible destinations
that the address may point to. Blocks are allowed to occur multiple
times in the destination list, though this isn't particularly useful.
This destination list is required so that dataflow analysis has an
accurate understanding of the CFG.
Semantics:
""""""""""
Control transfers to the block specified in the address argument. All
possible destination blocks must be listed in the label list, otherwise
this instruction has undefined behavior. This implies that jumps to
labels defined in other functions have undefined behavior as well.
Implementation:
"""""""""""""""
This is typically implemented with a jump through a register.
Example:
""""""""
.. code-block:: llvm
indirectbr i8* %Addr, [ label %bb1, label %bb2, label %bb3 ]
.. _i_invoke:
'``invoke``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = invoke [cconv] [ret attrs] <ty>|<fnty> <fnptrval>(<function args>) [fn attrs]
[operand bundles] to label <normal label> unwind label <exception label>
Overview:
"""""""""
The '``invoke``' instruction causes control to transfer to a specified
function, with the possibility of control flow transfer to either the
'``normal``' label or the '``exception``' label. If the callee function
returns with the "``ret``" instruction, control flow will return to the
"normal" label. If the callee (or any indirect callees) returns via the
":ref:`resume <i_resume>`" instruction or other exception handling
mechanism, control is interrupted and continued at the dynamically
nearest "exception" label.
The '``exception``' label is a `landing
pad <ExceptionHandling.html#overview>`_ for the exception. As such,
the '``exception``' label is required to have the
":ref:`landingpad <i_landingpad>`" instruction, which contains the
information about the behavior of the program after unwinding happens,
as its first non-PHI instruction. The restrictions on the
"``landingpad``" instruction tightly couple it to the "``invoke``"
instruction, so that the important information contained within the
"``landingpad``" instruction can't be lost through normal code motion.
Arguments:
""""""""""
This instruction requires several arguments:
#. The optional "cconv" marker indicates which :ref:`calling
convention <callingconv>` the call should use. If none is
specified, the call defaults to using C calling conventions.
#. The optional :ref:`Parameter Attributes <paramattrs>` list for return
values. Only '``zeroext``', '``signext``', and '``inreg``' attributes
are valid here.
#. '``ty``': the type of the call instruction itself which is also the
type of the return value. Functions that return no value are marked
``void``.
#. '``fnty``': shall be the signature of the function being invoked. The
argument types must match the types implied by this signature. This
type can be omitted if the function is not varargs.
#. '``fnptrval``': An LLVM value containing a pointer to a function to
be invoked. In most cases, this is a direct function invocation, but
indirect ``invoke``'s are just as possible, calling an arbitrary pointer
to function value.
#. '``function args``': argument list whose types match the function
signature argument types and parameter attributes. All arguments must
be of :ref:`first class <t_firstclass>` type. If the function signature
indicates the function accepts a variable number of arguments, the
extra arguments can be specified.
#. '``normal label``': the label reached when the called function
executes a '``ret``' instruction.
#. '``exception label``': the label reached when a callee returns via
the :ref:`resume <i_resume>` instruction or other exception handling
mechanism.
#. The optional :ref:`function attributes <fnattrs>` list.
#. The optional :ref:`operand bundles <opbundles>` list.
Semantics:
""""""""""
This instruction is designed to operate as a standard '``call``'
instruction in most regards. The primary difference is that it
establishes an association with a label, which is used by the runtime
library to unwind the stack.
This instruction is used in languages with destructors to ensure that
proper cleanup is performed in the case of either a ``longjmp`` or a
thrown exception. Additionally, this is important for implementation of
'``catch``' clauses in high-level languages that support them.
For the purposes of the SSA form, the definition of the value returned
by the '``invoke``' instruction is deemed to occur on the edge from the
current block to the "normal" label. If the callee unwinds then no
return value is available.
Example:
""""""""
.. code-block:: llvm
%retval = invoke i32 @Test(i32 15) to label %Continue
unwind label %TestCleanup ; i32:retval set
%retval = invoke coldcc i32 %Testfnptr(i32 15) to label %Continue
unwind label %TestCleanup ; i32:retval set
.. _i_resume:
'``resume``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
resume <type> <value>
Overview:
"""""""""
The '``resume``' instruction is a terminator instruction that has no
successors.
Arguments:
""""""""""
The '``resume``' instruction requires one argument, which must have the
same type as the result of any '``landingpad``' instruction in the same
function.
Semantics:
""""""""""
The '``resume``' instruction resumes propagation of an existing
(in-flight) exception whose unwinding was interrupted with a
:ref:`landingpad <i_landingpad>` instruction.
Example:
""""""""
.. code-block:: llvm
resume { i8*, i32 } %exn
.. _i_catchswitch:
'``catchswitch``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<resultval> = catchswitch within <parent> [ label <handler1>, label <handler2>, ... ] unwind to caller
<resultval> = catchswitch within <parent> [ label <handler1>, label <handler2>, ... ] unwind label <default>
Overview:
"""""""""
The '``catchswitch``' instruction is used by `LLVM's exception handling system
<ExceptionHandling.html#overview>`_ to describe the set of possible catch handlers
that may be executed by the :ref:`EH personality routine <personalityfn>`.
Arguments:
""""""""""
The ``parent`` argument is the token of the funclet that contains the
``catchswitch`` instruction. If the ``catchswitch`` is not inside a funclet,
this operand may be the token ``none``.
The ``default`` argument is the label of another basic block beginning with
either a ``cleanuppad`` or ``catchswitch`` instruction. This unwind destination
must be a legal target with respect to the ``parent`` links, as described in
the `exception handling documentation\ <ExceptionHandling.html#wineh-constraints>`_.
The ``handlers`` are a nonempty list of successor blocks that each begin with a
:ref:`catchpad <i_catchpad>` instruction.
Semantics:
""""""""""
Executing this instruction transfers control to one of the successors in
``handlers``, if appropriate, or continues to unwind via the unwind label if
present.
The ``catchswitch`` is both a terminator and a "pad" instruction, meaning that
it must be both the first non-phi instruction and last instruction in the basic
block. Therefore, it must be the only non-phi instruction in the block.
Example:
""""""""
.. code-block:: text
dispatch1:
%cs1 = catchswitch within none [label %handler0, label %handler1] unwind to caller
dispatch2:
%cs2 = catchswitch within %parenthandler [label %handler0] unwind label %cleanup
.. _i_catchret:
'``catchret``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
catchret from <token> to label <normal>
Overview:
"""""""""
The '``catchret``' instruction is a terminator instruction that has a
single successor.
Arguments:
""""""""""
The first argument to a '``catchret``' indicates which ``catchpad`` it
exits. It must be a :ref:`catchpad <i_catchpad>`.
The second argument to a '``catchret``' specifies where control will
transfer to next.
Semantics:
""""""""""
The '``catchret``' instruction ends an existing (in-flight) exception whose
unwinding was interrupted with a :ref:`catchpad <i_catchpad>` instruction. The
:ref:`personality function <personalityfn>` gets a chance to execute arbitrary
code to, for example, destroy the active exception. Control then transfers to
``normal``.
The ``token`` argument must be a token produced by a ``catchpad`` instruction.
If the specified ``catchpad`` is not the most-recently-entered not-yet-exited
funclet pad (as described in the `EH documentation\ <ExceptionHandling.html#wineh-constraints>`_),
the ``catchret``'s behavior is undefined.
Example:
""""""""
.. code-block:: text
catchret from %catch label %continue
.. _i_cleanupret:
'``cleanupret``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
cleanupret from <value> unwind label <continue>
cleanupret from <value> unwind to caller
Overview:
"""""""""
The '``cleanupret``' instruction is a terminator instruction that has
an optional successor.
Arguments:
""""""""""
The '``cleanupret``' instruction requires one argument, which indicates
which ``cleanuppad`` it exits, and must be a :ref:`cleanuppad <i_cleanuppad>`.
If the specified ``cleanuppad`` is not the most-recently-entered not-yet-exited
funclet pad (as described in the `EH documentation\ <ExceptionHandling.html#wineh-constraints>`_),
the ``cleanupret``'s behavior is undefined.
The '``cleanupret``' instruction also has an optional successor, ``continue``,
which must be the label of another basic block beginning with either a
``cleanuppad`` or ``catchswitch`` instruction. This unwind destination must
be a legal target with respect to the ``parent`` links, as described in the
`exception handling documentation\ <ExceptionHandling.html#wineh-constraints>`_.
Semantics:
""""""""""
The '``cleanupret``' instruction indicates to the
:ref:`personality function <personalityfn>` that one
:ref:`cleanuppad <i_cleanuppad>` it transferred control to has ended.
It transfers control to ``continue`` or unwinds out of the function.
Example:
""""""""
.. code-block:: text
cleanupret from %cleanup unwind to caller
cleanupret from %cleanup unwind label %continue
.. _i_unreachable:
'``unreachable``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
unreachable
Overview:
"""""""""
The '``unreachable``' instruction has no defined semantics. This
instruction is used to inform the optimizer that a particular portion of
the code is not reachable. This can be used to indicate that the code
after a no-return function cannot be reached, and other facts.
Semantics:
""""""""""
The '``unreachable``' instruction has no defined semantics.
.. _binaryops:
Binary Operations
-----------------
Binary operators are used to do most of the computation in a program.
They require two operands of the same type, execute an operation on
them, and produce a single value. The operands might represent multiple
data, as is the case with the :ref:`vector <t_vector>` data type. The
result value has the same type as its operands.
There are several different binary operators:
.. _i_add:
'``add``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = add <ty> <op1>, <op2> ; yields ty:result
<result> = add nuw <ty> <op1>, <op2> ; yields ty:result
<result> = add nsw <ty> <op1>, <op2> ; yields ty:result
<result> = add nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``add``' instruction returns the sum of its two operands.
Arguments:
""""""""""
The two arguments to the '``add``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The value produced is the integer sum of the two operands.
If the sum has unsigned overflow, the result returned is the
mathematical result modulo 2\ :sup:`n`\ , where n is the bit width of
the result.
Because LLVM integers use a two's complement representation, this
instruction is appropriate for both signed and unsigned integers.
``nuw`` and ``nsw`` stand for "No Unsigned Wrap" and "No Signed Wrap",
respectively. If the ``nuw`` and/or ``nsw`` keywords are present, the
result value of the ``add`` is a :ref:`poison value <poisonvalues>` if
unsigned and/or signed overflow, respectively, occurs.
Example:
""""""""
.. code-block:: text
<result> = add i32 4, %var ; yields i32:result = 4 + %var
.. _i_fadd:
'``fadd``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fadd [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``fadd``' instruction returns the sum of its two operands.
Arguments:
""""""""""
The two arguments to the '``fadd``' instruction must be :ref:`floating
point <t_floating>` or :ref:`vector <t_vector>` of floating point values.
Both arguments must have identical types.
Semantics:
""""""""""
The value produced is the floating point sum of the two operands. This
instruction can also take any number of :ref:`fast-math flags <fastmath>`,
which are optimization hints to enable otherwise unsafe floating point
optimizations.
Example:
""""""""
.. code-block:: text
<result> = fadd float 4.0, %var ; yields float:result = 4.0 + %var
'``sub``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = sub <ty> <op1>, <op2> ; yields ty:result
<result> = sub nuw <ty> <op1>, <op2> ; yields ty:result
<result> = sub nsw <ty> <op1>, <op2> ; yields ty:result
<result> = sub nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``sub``' instruction returns the difference of its two operands.
Note that the '``sub``' instruction is used to represent the '``neg``'
instruction present in most other intermediate representations.
Arguments:
""""""""""
The two arguments to the '``sub``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The value produced is the integer difference of the two operands.
If the difference has unsigned overflow, the result returned is the
mathematical result modulo 2\ :sup:`n`\ , where n is the bit width of
the result.
Because LLVM integers use a two's complement representation, this
instruction is appropriate for both signed and unsigned integers.
``nuw`` and ``nsw`` stand for "No Unsigned Wrap" and "No Signed Wrap",
respectively. If the ``nuw`` and/or ``nsw`` keywords are present, the
result value of the ``sub`` is a :ref:`poison value <poisonvalues>` if
unsigned and/or signed overflow, respectively, occurs.
Example:
""""""""
.. code-block:: text
<result> = sub i32 4, %var ; yields i32:result = 4 - %var
<result> = sub i32 0, %val          ; yields i32:result = -%val
.. _i_fsub:
'``fsub``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fsub [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``fsub``' instruction returns the difference of its two operands.
Note that the '``fsub``' instruction is used to represent the '``fneg``'
instruction present in most other intermediate representations.
Arguments:
""""""""""
The two arguments to the '``fsub``' instruction must be :ref:`floating
point <t_floating>` or :ref:`vector <t_vector>` of floating point values.
Both arguments must have identical types.
Semantics:
""""""""""
The value produced is the floating point difference of the two operands.
This instruction can also take any number of :ref:`fast-math
flags <fastmath>`, which are optimization hints to enable otherwise
unsafe floating point optimizations.
Example:
""""""""
.. code-block:: text
<result> = fsub float 4.0, %var ; yields float:result = 4.0 - %var
<result> = fsub float -0.0, %val          ; yields float:result = -%val
'``mul``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = mul <ty> <op1>, <op2> ; yields ty:result
<result> = mul nuw <ty> <op1>, <op2> ; yields ty:result
<result> = mul nsw <ty> <op1>, <op2> ; yields ty:result
<result> = mul nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``mul``' instruction returns the product of its two operands.
Arguments:
""""""""""
The two arguments to the '``mul``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The value produced is the integer product of the two operands.
If the result of the multiplication has unsigned overflow, the result
returned is the mathematical result modulo 2\ :sup:`n`\ , where n is the
bit width of the result.
Because LLVM integers use a two's complement representation, and the
result is the same width as the operands, this instruction returns the
correct result for both signed and unsigned integers. If a full product
(e.g. ``i32`` * ``i32`` -> ``i64``) is needed, the operands should be
sign-extended or zero-extended as appropriate to the width of the full
product.
``nuw`` and ``nsw`` stand for "No Unsigned Wrap" and "No Signed Wrap",
respectively. If the ``nuw`` and/or ``nsw`` keywords are present, the
result value of the ``mul`` is a :ref:`poison value <poisonvalues>` if
unsigned and/or signed overflow, respectively, occurs.
Example:
""""""""
.. code-block:: text
<result> = mul i32 4, %var ; yields i32:result = 4 * %var
.. _i_fmul:
'``fmul``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fmul [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``fmul``' instruction returns the product of its two operands.
Arguments:
""""""""""
The two arguments to the '``fmul``' instruction must be :ref:`floating
point <t_floating>` or :ref:`vector <t_vector>` of floating point values.
Both arguments must have identical types.
Semantics:
""""""""""
The value produced is the floating point product of the two operands.
This instruction can also take any number of :ref:`fast-math
flags <fastmath>`, which are optimization hints to enable otherwise
unsafe floating point optimizations.
Example:
""""""""
.. code-block:: text
<result> = fmul float 4.0, %var ; yields float:result = 4.0 * %var
'``udiv``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = udiv <ty> <op1>, <op2> ; yields ty:result
<result> = udiv exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``udiv``' instruction returns the quotient of its two operands.
Arguments:
""""""""""
The two arguments to the '``udiv``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The value produced is the unsigned integer quotient of the two operands.
Note that unsigned integer division and signed integer division are
distinct operations; for signed integer division, use '``sdiv``'.
Division by zero is undefined behavior. For vectors, if any element
of the divisor is zero, the operation has undefined behavior.
If the ``exact`` keyword is present, the result value of the ``udiv`` is
a :ref:`poison value <poisonvalues>` if %op1 is not a multiple of %op2 (as
such, "((a udiv exact b) mul b) == a").
Example:
""""""""
.. code-block:: text
<result> = udiv i32 4, %var ; yields i32:result = 4 / %var
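An additional, non-normative illustration of the ``exact`` keyword (the
divisor 16 is arbitrary):
.. code-block:: llvm
%r = udiv exact i32 %a, 16   ; poison unless %a is a multiple of 16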
'``sdiv``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = sdiv <ty> <op1>, <op2> ; yields ty:result
<result> = sdiv exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``sdiv``' instruction returns the quotient of its two operands.
Arguments:
""""""""""
The two arguments to the '``sdiv``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The value produced is the signed integer quotient of the two operands
rounded towards zero.
Note that signed integer division and unsigned integer division are
distinct operations; for unsigned integer division, use '``udiv``'.
Division by zero is undefined behavior. For vectors, if any element
of the divisor is zero, the operation has undefined behavior.
Overflow also leads to undefined behavior; this is a rare case, but can
occur, for example, by doing a 32-bit division of -2147483648 by -1.
If the ``exact`` keyword is present, the result value of the ``sdiv`` is
a :ref:`poison value <poisonvalues>` if the result would be rounded.
Example:
""""""""
.. code-block:: text
<result> = sdiv i32 4, %var ; yields i32:result = 4 / %var
.. _i_fdiv:
'``fdiv``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fdiv [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``fdiv``' instruction returns the quotient of its two operands.
Arguments:
""""""""""
The two arguments to the '``fdiv``' instruction must be :ref:`floating
point <t_floating>` or :ref:`vector <t_vector>` of floating point values.
Both arguments must have identical types.
Semantics:
""""""""""
The value produced is the floating point quotient of the two operands.
This instruction can also take any number of :ref:`fast-math
flags <fastmath>`, which are optimization hints to enable otherwise
unsafe floating point optimizations.
Example:
""""""""
.. code-block:: text
<result> = fdiv float 4.0, %var ; yields float:result = 4.0 / %var
'``urem``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = urem <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``urem``' instruction returns the remainder from the unsigned
division of its two arguments.
Arguments:
""""""""""
The two arguments to the '``urem``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
This instruction returns the unsigned integer *remainder* of a division.
This instruction always performs an unsigned division to get the
remainder.
Note that unsigned integer remainder and signed integer remainder are
distinct operations; for signed integer remainder, use '``srem``'.
Taking the remainder of a division by zero is undefined behavior.
For vectors, if any element of the divisor is zero, the operation has
undefined behavior.
Example:
""""""""
.. code-block:: text
<result> = urem i32 4, %var ; yields i32:result = 4 % %var
'``srem``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = srem <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``srem``' instruction returns the remainder from the signed
division of its two operands. This instruction can also take
:ref:`vector <t_vector>` versions of the values, in which case the elements
must be integers.
Arguments:
""""""""""
The two arguments to the '``srem``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
This instruction returns the *remainder* of a division (where the result
is either zero or has the same sign as the dividend, ``op1``), not the
*modulo* operator (where the result is either zero or has the same sign
as the divisor, ``op2``) of a value. For more information about the
difference, see `The Math
Forum <http://mathforum.org/dr.math/problems/anne.4.28.99.html>`_. For a
table of how this is implemented in various languages, please see
`Wikipedia: modulo
operation <http://en.wikipedia.org/wiki/Modulo_operation>`_.
Note that signed integer remainder and unsigned integer remainder are
distinct operations; for unsigned integer remainder, use '``urem``'.
Taking the remainder of a division by zero is undefined behavior.
For vectors, if any element of the divisor is zero, the operation has
undefined behavior.
Overflow also leads to undefined behavior; this is a rare case, but can
occur, for example, by taking the remainder of a 32-bit division of
-2147483648 by -1. (The remainder doesn't actually overflow, but this
rule lets srem be implemented using instructions that return both the
result of the division and the remainder.)
Example:
""""""""
.. code-block:: text
<result> = srem i32 4, %var ; yields i32:result = 4 % %var
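A non-normative illustration of the remainder-versus-modulo distinction
described above:
.. code-block:: llvm
%a = srem i32 -7, 3   ; yields i32:-1 (sign follows the dividend)
%b = srem i32 7, -3   ; yields i32:1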
.. _i_frem:
'``frem``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = frem [fast-math flags]* <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``frem``' instruction returns the remainder from the division of
its two operands.
Arguments:
""""""""""
The two arguments to the '``frem``' instruction must be :ref:`floating
point <t_floating>` or :ref:`vector <t_vector>` of floating point values.
Both arguments must have identical types.
Semantics:
""""""""""
This instruction returns the *remainder* of a division. The remainder
has the same sign as the dividend. This instruction can also take any
number of :ref:`fast-math flags <fastmath>`, which are optimization hints
to enable otherwise unsafe floating point optimizations.
Example:
""""""""
.. code-block:: text
<result> = frem float 4.0, %var ; yields float:result = 4.0 % %var
.. _bitwiseops:
Bitwise Binary Operations
-------------------------
Bitwise binary operators are used to do various forms of bit-twiddling
in a program. They are generally very efficient instructions and can
commonly be strength reduced from other instructions. They require two
operands of the same type, execute an operation on them, and produce a
single value. The resulting value is the same type as its operands.
'``shl``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = shl <ty> <op1>, <op2> ; yields ty:result
<result> = shl nuw <ty> <op1>, <op2> ; yields ty:result
<result> = shl nsw <ty> <op1>, <op2> ; yields ty:result
<result> = shl nuw nsw <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``shl``' instruction returns the first operand shifted to the left
a specified number of bits.
Arguments:
""""""""""
Both arguments to the '``shl``' instruction must be the same
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer type.
'``op2``' is treated as an unsigned value.
Semantics:
""""""""""
The value produced is ``op1`` \* 2\ :sup:`op2` mod 2\ :sup:`n`,
where ``n`` is the width of the result. If ``op2`` is (statically or
dynamically) equal to or larger than the number of bits in
``op1``, this instruction returns a :ref:`poison value <poisonvalues>`.
If the arguments are vectors, each vector element of ``op1`` is shifted
by the corresponding shift amount in ``op2``.
If the ``nuw`` keyword is present, then the shift produces a poison
value if it shifts out any non-zero bits.
If the ``nsw`` keyword is present, then the shift produces a poison
value if it shifts out any bits that disagree with the resultant sign bit.
Example:
""""""""
.. code-block:: text
<result> = shl i32 4, %var ; yields i32: 4 << %var
<result> = shl i32 4, 2 ; yields i32: 16
<result> = shl i32 1, 10 ; yields i32: 1024
<result> = shl i32 1, 32 ; undefined
<result> = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 2, i32 4>
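For illustration only, the following shows how the ``nuw`` and ``nsw`` rules
apply to shifts (the constants were chosen purely to demonstrate the rules):
.. code-block:: llvm
%a = shl nuw i32 1073741824, 2   ; a set bit is shifted out; poison under nuw
%b = shl nsw i32 1, 31           ; shifted-out zeros disagree with the resulting sign bit; poison under nsw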
'``lshr``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = lshr <ty> <op1>, <op2> ; yields ty:result
<result> = lshr exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``lshr``' instruction (logical shift right) returns the first
operand shifted to the right a specified number of bits with zero fill.
Arguments:
""""""""""
Both arguments to the '``lshr``' instruction must be the same
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer type.
'``op2``' is treated as an unsigned value.
Semantics:
""""""""""
This instruction always performs a logical shift right operation. The
most significant bits of the result will be filled with zero bits after
the shift. If ``op2`` is (statically or dynamically) equal to or larger
than the number of bits in ``op1``, this instruction returns a :ref:`poison
value <poisonvalues>`. If the arguments are vectors, each vector element
of ``op1`` is shifted by the corresponding shift amount in ``op2``.
If the ``exact`` keyword is present, the result value of the ``lshr`` is
a poison value if any of the bits shifted out are non-zero.
Example:
""""""""
.. code-block:: text
<result> = lshr i32 4, 1 ; yields i32:result = 2
<result> = lshr i32 4, 2 ; yields i32:result = 1
<result> = lshr i8 4, 3 ; yields i8:result = 0
<result> = lshr i8 -2, 1 ; yields i8:result = 0x7F
<result> = lshr i32 1, 32 ; undefined
<result> = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1>
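A non-normative illustration of the ``exact`` keyword on shifts:
.. code-block:: llvm
%a = lshr exact i32 16, 2   ; yields i32:4 (no non-zero bits shifted out)
%b = lshr exact i32 5, 1    ; a non-zero bit is shifted out; poison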
'``ashr``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = ashr <ty> <op1>, <op2> ; yields ty:result
<result> = ashr exact <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``ashr``' instruction (arithmetic shift right) returns the first
operand shifted to the right a specified number of bits with sign
extension.
Arguments:
""""""""""
Both arguments to the '``ashr``' instruction must be the same
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer type.
'``op2``' is treated as an unsigned value.
Semantics:
""""""""""
This instruction always performs an arithmetic shift right operation.
The most significant bits of the result will be filled with the sign bit
of ``op1``. If ``op2`` is (statically or dynamically) equal to or larger
than the number of bits in ``op1``, this instruction returns a :ref:`poison
value <poisonvalues>`. If the arguments are vectors, each vector element
of ``op1`` is shifted by the corresponding shift amount in ``op2``.
If the ``exact`` keyword is present, the result value of the ``ashr`` is
a poison value if any of the bits shifted out are non-zero.
Example:
""""""""
.. code-block:: text
<result> = ashr i32 4, 1 ; yields i32:result = 2
<result> = ashr i32 4, 2 ; yields i32:result = 1
<result> = ashr i8 4, 3 ; yields i8:result = 0
<result> = ashr i8 -2, 1 ; yields i8:result = -1
<result> = ashr i32 1, 32 ; undefined
<result> = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3> ; yields: result=<2 x i32> < i32 -1, i32 0>
'``and``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = and <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``and``' instruction returns the bitwise logical and of its two
operands.
Arguments:
""""""""""
The two arguments to the '``and``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The truth table used for the '``and``' instruction is:
+-----+-----+-----+
| In0 | In1 | Out |
+-----+-----+-----+
| 0 | 0 | 0 |
+-----+-----+-----+
| 0 | 1 | 0 |
+-----+-----+-----+
| 1 | 0 | 0 |
+-----+-----+-----+
| 1 | 1 | 1 |
+-----+-----+-----+
Example:
""""""""
.. code-block:: text
<result> = and i32 4, %var ; yields i32:result = 4 & %var
<result> = and i32 15, 40 ; yields i32:result = 8
<result> = and i32 4, 8 ; yields i32:result = 0
'``or``' Instruction
^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = or <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``or``' instruction returns the bitwise logical inclusive or of its
two operands.
Arguments:
""""""""""
The two arguments to the '``or``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The truth table used for the '``or``' instruction is:
+-----+-----+-----+
| In0 | In1 | Out |
+-----+-----+-----+
| 0 | 0 | 0 |
+-----+-----+-----+
| 0 | 1 | 1 |
+-----+-----+-----+
| 1 | 0 | 1 |
+-----+-----+-----+
| 1 | 1 | 1 |
+-----+-----+-----+
Example:
""""""""
.. code-block:: text
<result> = or i32 4, %var ; yields i32:result = 4 | %var
<result> = or i32 15, 40 ; yields i32:result = 47
<result> = or i32 4, 8 ; yields i32:result = 12
'``xor``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = xor <ty> <op1>, <op2> ; yields ty:result
Overview:
"""""""""
The '``xor``' instruction returns the bitwise logical exclusive or of
its two operands. The ``xor`` is used to implement the "one's
complement" operation, which is the "~" operator in C.
Arguments:
""""""""""
The two arguments to the '``xor``' instruction must be
:ref:`integer <t_integer>` or :ref:`vector <t_vector>` of integer values. Both
arguments must have identical types.
Semantics:
""""""""""
The truth table used for the '``xor``' instruction is:
+-----+-----+-----+
| In0 | In1 | Out |
+-----+-----+-----+
| 0 | 0 | 0 |
+-----+-----+-----+
| 0 | 1 | 1 |
+-----+-----+-----+
| 1 | 0 | 1 |
+-----+-----+-----+
| 1 | 1 | 0 |
+-----+-----+-----+
Example:
""""""""
.. code-block:: text
<result> = xor i32 4, %var ; yields i32:result = 4 ^ %var
<result> = xor i32 15, 40 ; yields i32:result = 39
<result> = xor i32 4, 8 ; yields i32:result = 12
<result> = xor i32 %V, -1 ; yields i32:result = ~%V
Vector Operations
-----------------
LLVM supports several instructions to represent vector operations in a
target-independent manner. These instructions cover the element-access
and vector-specific operations needed to process vectors effectively.
While LLVM does directly support these vector operations, many
sophisticated algorithms will want to use target-specific intrinsics to
take full advantage of a specific target.
.. _i_extractelement:
'``extractelement``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = extractelement <n x <ty>> <val>, <ty2> <idx> ; yields <ty>
Overview:
"""""""""
The '``extractelement``' instruction extracts a single scalar element
from a vector at a specified index.
Arguments:
""""""""""
The first operand of an '``extractelement``' instruction is a value of
:ref:`vector <t_vector>` type. The second operand is an index indicating
the position from which to extract the element. The index may be a
variable of any integer type.
Semantics:
""""""""""
The result is a scalar of the same type as the element type of ``val``.
Its value is the value at position ``idx`` of ``val``. If ``idx``
exceeds the length of ``val``, the results are undefined.
Example:
""""""""
.. code-block:: text
<result> = extractelement <4 x i32> %vec, i32 0 ; yields i32
.. _i_insertelement:
'``insertelement``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = insertelement <n x <ty>> <val>, <ty> <elt>, <ty2> <idx> ; yields <n x <ty>>
Overview:
"""""""""
The '``insertelement``' instruction inserts a scalar element into a
vector at a specified index.
Arguments:
""""""""""
The first operand of an '``insertelement``' instruction is a value of
:ref:`vector <t_vector>` type. The second operand is a scalar value whose
type must equal the element type of the first operand. The third operand
is an index indicating the position at which to insert the value. The
index may be a variable of any integer type.
Semantics:
""""""""""
The result is a vector of the same type as ``val``. Its element values
are those of ``val`` except at position ``idx``, where it gets the value
``elt``. If ``idx`` exceeds the length of ``val``, the results are
undefined.
Example:
""""""""
.. code-block:: text
<result> = insertelement <4 x i32> %vec, i32 1, i32 0 ; yields <4 x i32>
.. _i_shufflevector:
'``shufflevector``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask> ; yields <m x <ty>>
Overview:
"""""""""
The '``shufflevector``' instruction constructs a permutation of elements
from two input vectors, returning a vector with the same element type as
the input and length that is the same as the shuffle mask.
Arguments:
""""""""""
The first two operands of a '``shufflevector``' instruction are vectors
with the same type. The third argument is a shuffle mask whose element
type is always 'i32'. The result of the instruction is a vector whose
length is the same as the shuffle mask and whose element type is the
same as the element type of the first two operands.
The shuffle mask operand is required to be a constant vector with either
constant integer or undef values.
Semantics:
""""""""""
The elements of the two input vectors are numbered from left to right
across both of the vectors. The shuffle mask operand specifies, for each
element of the result vector, which element of the two input vectors the
result element gets. If the shuffle mask is undef, the result vector is
undef. If any element of the mask operand is undef, that element of the
result is undef. If the shuffle mask selects an undef element from one
of the input vectors, the resulting element is undef.
Example:
""""""""
.. code-block:: text
<result> = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<4 x i32> <i32 0, i32 4, i32 1, i32 5> ; yields <4 x i32>
<result> = shufflevector <4 x i32> %v1, <4 x i32> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3> ; yields <4 x i32> - Identity shuffle.
<result> = shufflevector <8 x i32> %v1, <8 x i32> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3> ; yields <4 x i32>
<result> = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ; yields <8 x i32>
Aggregate Operations
--------------------
LLVM supports several instructions for working with
:ref:`aggregate <t_aggregate>` values.
.. _i_extractvalue:
'``extractvalue``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = extractvalue <aggregate type> <val>, <idx>{, <idx>}*
Overview:
"""""""""
The '``extractvalue``' instruction extracts the value of a member field
from an :ref:`aggregate <t_aggregate>` value.
Arguments:
""""""""""
The first operand of an '``extractvalue``' instruction is a value of
:ref:`struct <t_struct>` or :ref:`array <t_array>` type. The other operands are
constant indices to specify which value to extract in a similar manner
as indices in a '``getelementptr``' instruction.
The major differences to ``getelementptr`` indexing are:
- Since the value being indexed is not a pointer, the first index is
omitted and assumed to be zero.
- At least one index must be specified.
- Not only struct indices but also array indices must be in bounds.
Semantics:
""""""""""
The result is the value at the position in the aggregate specified by
the index operands.
Example:
""""""""
.. code-block:: text
<result> = extractvalue {i32, float} %agg, 0 ; yields i32
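As a non-normative sketch of multiple indices (the aggregate types below are
arbitrary illustrations):
.. code-block:: llvm
%x = extractvalue {i32, {float, double}} %agg, 1, 1   ; yields double
%y = extractvalue [4 x i32] %arr, 2                   ; yields i32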
.. _i_insertvalue:
'``insertvalue``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = insertvalue <aggregate type> <val>, <ty> <elt>, <idx>{, <idx>}* ; yields <aggregate type>
Overview:
"""""""""
The '``insertvalue``' instruction inserts a value into a member field in
an :ref:`aggregate <t_aggregate>` value.
Arguments:
""""""""""
The first operand of an '``insertvalue``' instruction is a value of
:ref:`struct <t_struct>` or :ref:`array <t_array>` type. The second operand is
a first-class value to insert. The following operands are constant
indices indicating the position at which to insert the value in a
similar manner as indices in an '``extractvalue``' instruction. The value
to insert must have the same type as the value identified by the
indices.
Semantics:
""""""""""
The result is an aggregate of the same type as ``val``. Its value is
that of ``val`` except that the value at the position specified by the
indices is that of ``elt``.
Example:
""""""""
.. code-block:: llvm
%agg1 = insertvalue {i32, float} undef, i32 1, 0 ; yields {i32 1, float undef}
%agg2 = insertvalue {i32, float} %agg1, float %val, 1 ; yields {i32 1, float %val}
%agg3 = insertvalue {i32, {float}} undef, float %val, 1, 0 ; yields {i32 undef, {float %val}}
.. _memoryops:
Memory Access and Addressing Operations
---------------------------------------
A key design point of an SSA-based representation is how it represents
memory. In LLVM, no memory locations are in SSA form, which makes things
very simple. This section describes how to read, write, and allocate
memory in LLVM.
.. _i_alloca:
'``alloca``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>] [, addrspace(<num>)] ; yields type addrspace(num)*:result
Overview:
"""""""""
The '``alloca``' instruction allocates memory on the stack frame of the
currently executing function, to be automatically released when this
function returns to its caller. The object is always allocated in the
address space for allocas indicated in the datalayout.
Arguments:
""""""""""
The '``alloca``' instruction allocates ``sizeof(<type>)*NumElements``
bytes of memory on the runtime stack, returning a pointer of the
appropriate type to the program. If "NumElements" is specified, it is
the number of elements allocated; otherwise "NumElements" defaults to
one. If a constant alignment is specified, the value result of the
allocation is guaranteed to be aligned to at least that boundary. The
alignment may not be greater than ``1 << 29``. If not specified, or if
zero, the target can choose to align the allocation on any convenient
boundary compatible with the type.
'``type``' may be any sized type.
Semantics:
""""""""""
Memory is allocated; a pointer is returned. The operation is undefined
if there is insufficient stack space for the allocation. '``alloca``'d
memory is automatically released when the function returns. The
'``alloca``' instruction is commonly used to represent automatic
variables that must have an address available. When the function returns
(either with the ``ret`` or ``resume`` instructions), the memory is
reclaimed. Allocating zero bytes is legal, but the result is undefined.
The order in which memory is allocated (i.e., which way the stack grows)
is not specified.
Example:
""""""""
.. code-block:: llvm
%ptr = alloca i32 ; yields i32*:ptr
%ptr = alloca i32, i32 4 ; yields i32*:ptr
%ptr = alloca i32, i32 4, align 1024 ; yields i32*:ptr
%ptr = alloca i32, align 1024 ; yields i32*:ptr
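The syntax above also allows an explicit address space; as a non-normative
sketch (address space 5 is an arbitrary, target-specific choice):
.. code-block:: llvm
%ptr = alloca i32, align 8, addrspace(5)   ; yields i32 addrspace(5)*:ptr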
.. _i_load:
'``load``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = load [volatile] <ty>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.load !<index>][, !invariant.group !<index>][, !nonnull !<index>][, !dereferenceable !<deref_bytes_node>][, !dereferenceable_or_null !<deref_bytes_node>][, !align !<align_node>]
<result> = load atomic [volatile] <ty>, <ty>* <pointer> [syncscope("<target-scope>")] <ordering>, align <alignment> [, !invariant.group !<index>]
!<index> = !{ i32 1 }
!<deref_bytes_node> = !{i64 <dereferenceable_bytes>}
!<align_node> = !{ i64 <value_alignment> }
Overview:
"""""""""
The '``load``' instruction is used to read from memory.
Arguments:
""""""""""
The argument to the ``load`` instruction specifies the memory address from which
to load. The type specified must be a :ref:`first class <t_firstclass>` type of
known size (i.e. not containing an :ref:`opaque structural type <t_opaque>`). If
the ``load`` is marked as ``volatile``, then the optimizer is not allowed to
modify the number or order of execution of this ``load`` with other
:ref:`volatile operations <volatile>`.
If the ``load`` is marked as ``atomic``, it takes an extra :ref:`ordering
<ordering>` and optional ``syncscope("<target-scope>")`` argument. The
``release`` and ``acq_rel`` orderings are not valid on ``load`` instructions.
Atomic loads produce :ref:`defined <memmodel>` results when they may see
multiple atomic stores. The type of the pointee must be an integer, pointer, or
floating-point type whose bit width is a power of two greater than or equal to
eight and less than or equal to a target-specific size limit. ``align`` must be
explicitly specified on atomic loads, and the load has undefined behavior if the
alignment is not set to a value which is at least the size in bytes of the
pointee. ``!nontemporal`` does not have any defined semantics for atomic loads.
The optional constant ``align`` argument specifies the alignment of the
operation (that is, the alignment of the memory address). A value of 0
or an omitted ``align`` argument means that the operation has the ABI
alignment for the target. It is the responsibility of the code emitter
to ensure that the alignment information is correct. Overestimating the
alignment results in undefined behavior. Underestimating the alignment
may produce less efficient code. An alignment of 1 is always safe. The
maximum possible alignment is ``1 << 29``. An alignment value higher
than the size of the loaded type implies memory up to the alignment
value bytes can be safely loaded without trapping in the default
address space. Accessing the high bytes can interfere with debugging
tools, so those bytes should not be accessed if the function has the
``sanitize_thread`` or ``sanitize_address`` attributes.
The optional ``!nontemporal`` metadata must reference a single
metadata name ``<index>`` corresponding to a metadata node with one
``i32`` entry of value 1. The existence of the ``!nontemporal``
metadata on the instruction tells the optimizer and code generator
that this load is not expected to be reused in the cache. The code
generator may select special instructions to save cache bandwidth, such
as the ``MOVNT`` instruction on x86.
The optional ``!invariant.load`` metadata must reference a single
metadata name ``<index>`` corresponding to a metadata node with no
entries. If a load instruction tagged with the ``!invariant.load``
metadata is executed, the optimizer may assume the memory location
referenced by the load contains the same value at all points in the
program where the memory location is known to be dereferenceable.
The optional ``!invariant.group`` metadata must reference a single metadata name
``<index>`` corresponding to a metadata node. See ``invariant.group`` metadata.
The optional ``!nonnull`` metadata must reference a single
metadata name ``<index>`` corresponding to a metadata node with no
entries. The existence of the ``!nonnull`` metadata on the
instruction tells the optimizer that the value loaded is known to
never be null. This is analogous to the ``nonnull`` attribute
on parameters and return values. This metadata can only be applied
to loads of a pointer type.
The optional ``!dereferenceable`` metadata must reference a single metadata
name ``<deref_bytes_node>`` corresponding to a metadata node with one ``i64``
entry. The existence of the ``!dereferenceable`` metadata on the instruction
tells the optimizer that the value loaded is known to be dereferenceable.
The number of bytes known to be dereferenceable is specified by the integer
value in the metadata node. This is analogous to the ``dereferenceable``
attribute on parameters and return values. This metadata can only be applied
to loads of a pointer type.
The optional ``!dereferenceable_or_null`` metadata must reference a single
metadata name ``<deref_bytes_node>`` corresponding to a metadata node with one
``i64`` entry. The existence of the ``!dereferenceable_or_null`` metadata on the
instruction tells the optimizer that the value loaded is known to be either
dereferenceable or null.
The number of bytes known to be dereferenceable is specified by the integer
value in the metadata node. This is analogous to the ``dereferenceable_or_null``
attribute on parameters and return values. This metadata can only be applied
to loads of a pointer type.
The optional ``!align`` metadata must reference a single metadata name
``<align_node>`` corresponding to a metadata node with one ``i64`` entry.
The existence of the ``!align`` metadata on the instruction tells the
optimizer that the value loaded is known to be aligned to a boundary specified
by the integer value in the metadata node. The alignment must be a power of 2.
This is analogous to the ``align`` attribute on parameters and return values.
This metadata can only be applied to loads of a pointer type.
Semantics:
""""""""""
The location of memory pointed to is loaded. If the value being loaded
is of scalar type then the number of bytes read does not exceed the
minimum number of bytes needed to hold all bits of the type. For
example, loading an ``i24`` reads at most three bytes. When loading a
value of a type like ``i20`` with a size that is not an integral number
of bytes, the result is undefined if the value was not originally
written using a store of the same type.
Examples:
"""""""""
.. code-block:: llvm
%ptr = alloca i32 ; yields i32*:ptr
store i32 3, i32* %ptr ; yields void
%val = load i32, i32* %ptr ; yields i32:val = i32 3
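A non-normative illustration of an atomic load; the orderings were chosen
arbitrarily from those permitted above:
.. code-block:: llvm
%a = load atomic i32, i32* %ptr acquire, align 4              ; yields i32
%b = load atomic volatile i32, i32* %ptr monotonic, align 4   ; yields i32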
.. _i_store:
'``store``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
store [volatile] <ty> <value>, <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.group !<index>] ; yields void
store atomic [volatile] <ty> <value>, <ty>* <pointer> [syncscope("<target-scope>")] <ordering>, align <alignment> [, !invariant.group !<index>] ; yields void
Overview:
"""""""""
The '``store``' instruction is used to write to memory.
Arguments:
""""""""""
There are two arguments to the ``store`` instruction: a value to store and an
address at which to store it. The type of the ``<pointer>`` operand must be a
pointer to the :ref:`first class <t_firstclass>` type of the ``<value>``
operand. If the ``store`` is marked as ``volatile``, then the optimizer is not
allowed to modify the number or order of execution of this ``store`` with other
:ref:`volatile operations <volatile>`. Only values of :ref:`first class
<t_firstclass>` types of known size (i.e. not containing an :ref:`opaque
structural type <t_opaque>`) can be stored.
If the ``store`` is marked as ``atomic``, it takes an extra :ref:`ordering
<ordering>` and optional ``syncscope("<target-scope>")`` argument. The
``acquire`` and ``acq_rel`` orderings aren't valid on ``store`` instructions.
Atomic loads produce :ref:`defined <memmodel>` results when they may see
multiple atomic stores. The type of the pointee must be an integer, pointer, or
floating-point type whose bit width is a power of two greater than or equal to
eight and less than or equal to a target-specific size limit. ``align`` must be
explicitly specified on atomic stores, and the store has undefined behavior if
the alignment is not set to a value which is at least the size in bytes of the
pointee. ``!nontemporal`` does not have any defined semantics for atomic stores.
The optional constant ``align`` argument specifies the alignment of the
operation (that is, the alignment of the memory address). A value of 0
or an omitted ``align`` argument means that the operation has the ABI
alignment for the target. It is the responsibility of the code emitter
to ensure that the alignment information is correct. Overestimating the
alignment results in undefined behavior. Underestimating the
alignment may produce less efficient code. An alignment of 1 is always
safe. The maximum possible alignment is ``1 << 29``. An alignment
value higher than the size of the stored type implies memory up to the
alignment value bytes can be stored to without trapping in the default
address space. Storing to the higher bytes however may result in data
races if another thread can access the same address. Introducing a
data race is not allowed. Storing to the extra bytes is not allowed
even in situations where a data race is known to not exist if the
function has the ``sanitize_address`` attribute.
The optional ``!nontemporal`` metadata must reference a single metadata
name ``<index>`` corresponding to a metadata node with one ``i32`` entry of
value 1. The existence of the ``!nontemporal`` metadata on the instruction
tells the optimizer and code generator that this load is not expected to
be reused in the cache. The code generator may select special
instructions to save cache bandwidth, such as the ``MOVNT`` instruction on
x86.
The optional ``!invariant.group`` metadata must reference a
single metadata name ``<index>``. See ``invariant.group`` metadata.
Semantics:
""""""""""
The contents of memory are updated to contain ``<value>`` at the
location specified by the ``<pointer>`` operand. If ``<value>`` is
of scalar type then the number of bytes written does not exceed the
minimum number of bytes needed to hold all bits of the type. For
example, storing an ``i24`` writes at most three bytes. When writing a
value of a type like ``i20`` with a size that is not an integral number
of bytes, it is unspecified what happens to the extra bits that do not
belong to the type, but they will typically be overwritten.
Example:
""""""""
.. code-block:: llvm
%ptr = alloca i32 ; yields i32*:ptr
store i32 3, i32* %ptr ; yields void
%val = load i32, i32* %ptr ; yields i32:val = i32 3
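A non-normative illustration of an atomic store; the synchronization scope
shown is just one example:
.. code-block:: llvm
store atomic i32 %val, i32* %ptr release, align 4                            ; yields void
store atomic i32 %val, i32* %ptr syncscope("singlethread") seq_cst, align 4  ; yields void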
.. _i_fence:
'``fence``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
fence [syncscope("<target-scope>")] <ordering> ; yields void
Overview:
"""""""""
The '``fence``' instruction is used to introduce happens-before edges
between operations.
Arguments:
""""""""""
'``fence``' instructions take an :ref:`ordering <ordering>` argument which
defines what *synchronizes-with* edges they add. They can only be given
``acquire``, ``release``, ``acq_rel``, and ``seq_cst`` orderings.
Semantics:
""""""""""
A fence A which has (at least) ``release`` ordering semantics
*synchronizes with* a fence B with (at least) ``acquire`` ordering
semantics if and only if there exist atomic operations X and Y, both
operating on some atomic object M, such that A is sequenced before X, X
modifies M (either directly or through some side effect of a sequence
headed by X), Y is sequenced before B, and Y observes M. This provides a
*happens-before* dependency between A and B. Rather than an explicit
``fence``, one (but not both) of the atomic operations X or Y might
provide a ``release`` or ``acquire`` (resp.) ordering constraint and
still *synchronize-with* the explicit ``fence`` and establish the
*happens-before* edge.
A ``fence`` which has ``seq_cst`` ordering, in addition to having both
``acquire`` and ``release`` semantics specified above, participates in
the global program order of other ``seq_cst`` operations and/or fences.
A ``fence`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.
Example:
""""""""
.. code-block:: llvm
fence acquire ; yields void
fence syncscope("singlethread") seq_cst ; yields void
fence syncscope("agent") seq_cst ; yields void
.. _i_cmpxchg:
'``cmpxchg``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
cmpxchg [weak] [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [syncscope("<target-scope>")] <success ordering> <failure ordering> ; yields { ty, i1 }
Overview:
"""""""""
The '``cmpxchg``' instruction is used to atomically modify memory. It
loads a value in memory and compares it to a given value. If they are
equal, it tries to store a new value into the memory.
Arguments:
""""""""""
There are three arguments to the '``cmpxchg``' instruction: an address
to operate on, a value to compare to the value currently stored at that
address, and a new value to place at that address if the compared values
are equal. The type of '<cmp>' must be an integer or pointer type whose
bit width is a power of two greater than or equal to eight and less
than or equal to a target-specific size limit. '<cmp>' and '<new>' must
have the same type, and the type of '<pointer>' must be a pointer to
that type. If the ``cmpxchg`` is marked as ``volatile``, then the
optimizer is not allowed to modify the number or order of execution of
this ``cmpxchg`` with other :ref:`volatile operations <volatile>`.
The success and failure :ref:`ordering <ordering>` arguments specify how this
``cmpxchg`` synchronizes with other atomic operations. Both ordering parameters
must be at least ``monotonic``, the ordering constraint on failure must be no
stronger than that on success, and the failure ordering cannot be either
``release`` or ``acq_rel``.
A ``cmpxchg`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.
The pointer passed into cmpxchg must have alignment greater than or
equal to the size in memory of the operand.
Semantics:
""""""""""
The contents of memory at the location specified by the '``<pointer>``' operand
is read and compared to '``<cmp>``'; if the read value is equal, then
'``<new>``' is written. The original value at the location is returned, together
with a flag indicating success (true) or failure (false).
If the cmpxchg operation is marked as ``weak`` then a spurious failure is
permitted: the operation may not write ``<new>`` even if the comparison
matched.
If the cmpxchg operation is strong (the default), the i1 value is 1 if and only
if the value loaded equals ``cmp``.
A successful ``cmpxchg`` is a read-modify-write instruction for the purpose of
identifying release sequences. A failed ``cmpxchg`` is equivalent to an atomic
load with an ordering parameter determined by the second ordering parameter.
Example:
""""""""
.. code-block:: llvm
entry:
%orig = load atomic i32, i32* %ptr unordered, align 4 ; yields i32
br label %loop
loop:
%cmp = phi i32 [ %orig, %entry ], [%value_loaded, %loop]
%squared = mul i32 %cmp, %cmp
%val_success = cmpxchg i32* %ptr, i32 %cmp, i32 %squared acq_rel monotonic ; yields { i32, i1 }
%value_loaded = extractvalue { i32, i1 } %val_success, 0
%success = extractvalue { i32, i1 } %val_success, 1
br i1 %success, label %done, label %loop
done:
...
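As a non-normative sketch of a ``weak`` exchange, whose success flag may be
false even when the comparison matched (operand names here are placeholders):
.. code-block:: llvm
%pair = cmpxchg weak i32* %ptr, i32 %expected, i32 %desired acq_rel monotonic ; yields { i32, i1 }
%ok = extractvalue { i32, i1 } %pair, 1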
.. _i_atomicrmw:
'``atomicrmw``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
atomicrmw [volatile] <operation> <ty>* <pointer>, <ty> <value> [syncscope("<target-scope>")] <ordering> ; yields ty
Overview:
"""""""""
The '``atomicrmw``' instruction is used to atomically modify memory.
Arguments:
""""""""""
There are three arguments to the '``atomicrmw``' instruction: an
operation to apply, an address whose value to modify, and an argument to the
operation. The operation must be one of the following keywords:
- xchg
- add
- sub
- and
- nand
- or
- xor
- max
- min
- umax
- umin
The type of '<value>' must be an integer type whose bit width is a power
of two greater than or equal to eight and less than or equal to a
target-specific size limit. The type of the '``<pointer>``' operand must
be a pointer to that type. If the ``atomicrmw`` is marked as
``volatile``, then the optimizer is not allowed to modify the number or
order of execution of this ``atomicrmw`` with other :ref:`volatile
operations <volatile>`.
An ``atomicrmw`` instruction can also take an optional
":ref:`syncscope <syncscope>`" argument.
Semantics:
""""""""""
The contents of memory at the location specified by the '``<pointer>``'
operand are atomically read, modified, and written back. The original
value at the location is returned. The modification is specified by the
operation argument:
- xchg: ``*ptr = val``
- add: ``*ptr = *ptr + val``
- sub: ``*ptr = *ptr - val``
- and: ``*ptr = *ptr & val``
- nand: ``*ptr = ~(*ptr & val)``
- or: ``*ptr = *ptr | val``
- xor: ``*ptr = *ptr ^ val``
- max: ``*ptr = *ptr > val ? *ptr : val`` (using a signed comparison)
- min: ``*ptr = *ptr < val ? *ptr : val`` (using a signed comparison)
- umax: ``*ptr = *ptr > val ? *ptr : val`` (using an unsigned
comparison)
- umin: ``*ptr = *ptr < val ? *ptr : val`` (using an unsigned
comparison)
Example:
""""""""
.. code-block:: llvm
%old = atomicrmw add i32* %ptr, i32 1 acquire ; yields i32
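A further, non-normative illustration using other operations and an explicit
synchronization scope (orderings and scope chosen arbitrarily):
.. code-block:: llvm
%old1 = atomicrmw xchg i32* %ptr, i32 %new seq_cst                           ; yields i32
%old2 = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acquire  ; yields i32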
.. _i_getelementptr:
'``getelementptr``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = getelementptr <ty>, <ty>* <ptrval>{, [inrange] <ty> <idx>}*
<result> = getelementptr inbounds <ty>, <ty>* <ptrval>{, [inrange] <ty> <idx>}*
<result> = getelementptr <ty>, <ptr vector> <ptrval>, [inrange] <vector index type> <idx>
Overview:
"""""""""
The '``getelementptr``' instruction is used to get the address of a
subelement of an :ref:`aggregate <t_aggregate>` data structure. It performs
address calculation only and does not access memory. The instruction can also
be used to calculate a vector of such addresses.
Arguments:
""""""""""
The first argument is always a type used as the basis for the calculations.
The second argument is always a pointer or a vector of pointers, and is the
base address to start from. The remaining arguments are indices
that indicate which of the elements of the aggregate object are indexed.
The interpretation of each index is dependent on the type being indexed
into. The first index always indexes the pointer value given as the
second argument, the second index indexes a value of the type pointed to
(not necessarily the value directly pointed to, since the first index
can be non-zero), etc. The first type indexed into must be a pointer
value; subsequent types can be arrays, vectors, and structs. Note that
subsequent types being indexed into can never be pointers, since that
would require loading the pointer before continuing calculation.
The type of each index argument depends on the type it is indexing into.
When indexing into an (optionally packed) structure, only ``i32`` integer
**constants** are allowed (when using a vector of indices they must all
be the **same** ``i32`` integer constant). When indexing into an array,
pointer or vector, integers of any width are allowed, and they are not
required to be constant. These integers are treated as signed values
where relevant.
For example, let's consider a C code fragment and how it gets compiled
to LLVM:
.. code-block:: c
struct RT {
char A;
int B[10][20];
char C;
};
struct ST {
int X;
double Y;
struct RT Z;
};
int *foo(struct ST *s) {
return &s[1].Z.B[5][13];
}
The LLVM code generated by Clang is:
.. code-block:: llvm
%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }
define i32* @foo(%struct.ST* %s) nounwind uwtable readnone optsize ssp {
entry:
%arrayidx = getelementptr inbounds %struct.ST, %struct.ST* %s, i64 1, i32 2, i32 1, i64 5, i64 13
ret i32* %arrayidx
}
Semantics:
""""""""""
In the example above, the first index is indexing into the
'``%struct.ST*``' type, which is a pointer, yielding a '``%struct.ST``'
= '``{ i32, double, %struct.RT }``' type, a structure. The second index
indexes into the third element of the structure, yielding a
'``%struct.RT``' = '``{ i8 , [10 x [20 x i32]], i8 }``' type, another
structure. The third index indexes into the second element of the
structure, yielding a '``[10 x [20 x i32]]``' type, an array. The two
dimensions of the array are subscripted into, yielding an '``i32``'
type. The '``getelementptr``' instruction returns a pointer to this
element, thus computing a value of '``i32*``' type.
Note that it is perfectly legal to index partially through a structure,
returning a pointer to an inner element. Because of this, the LLVM code
for the given testcase is equivalent to:
.. code-block:: llvm
define i32* @foo(%struct.ST* %s) {
%t1 = getelementptr %struct.ST, %struct.ST* %s, i32 1 ; yields %struct.ST*:%t1
%t2 = getelementptr %struct.ST, %struct.ST* %t1, i32 0, i32 2 ; yields %struct.RT*:%t2
%t3 = getelementptr %struct.RT, %struct.RT* %t2, i32 0, i32 1 ; yields [10 x [20 x i32]]*:%t3
%t4 = getelementptr [10 x [20 x i32]], [10 x [20 x i32]]* %t3, i32 0, i32 5 ; yields [20 x i32]*:%t4
%t5 = getelementptr [20 x i32], [20 x i32]* %t4, i32 0, i32 13 ; yields i32*:%t5
ret i32* %t5
}
If the ``inbounds`` keyword is present, the result value of the
``getelementptr`` is a :ref:`poison value <poisonvalues>` if the base
pointer is not an *in bounds* address of an allocated object, or if any
of the addresses that would be formed by successive addition of the
offsets implied by the indices to the base address with infinitely
precise signed arithmetic are not an *in bounds* address of that
allocated object. The *in bounds* addresses for an allocated object are
all the addresses that point into the object, plus the address one byte
past the end. The only *in bounds* address for a null pointer in the
default address-space is the null pointer itself. In cases where the
base is a vector of pointers the ``inbounds`` keyword applies to each
of the computations element-wise.
If the ``inbounds`` keyword is not present, the offsets are added to the
base address with silently-wrapping two's complement arithmetic. If the
offsets have a different width from the pointer, they are sign-extended
or truncated to the width of the pointer. The result value of the
``getelementptr`` may be outside the object pointed to by the base
pointer. The result value may not necessarily be used to access memory
though, even if it happens to point into allocated storage. See the
:ref:`Pointer Aliasing Rules <pointeraliasing>` section for more
information.
If the ``inrange`` keyword is present before any index, loading from or
storing to any pointer derived from the ``getelementptr`` has undefined
behavior if the load or store would access memory outside of the bounds of
the element selected by the index marked as ``inrange``. The result of a
pointer comparison or ``ptrtoint`` (including ``ptrtoint``-like operations
involving memory) involving a pointer derived from a ``getelementptr`` with
the ``inrange`` keyword is undefined, with the exception of comparisons
in the case where both operands are in the range of the element selected
by the ``inrange`` keyword, inclusive of the address one past the end of
that element. Note that the ``inrange`` keyword is currently only allowed
in constant ``getelementptr`` expressions.
The getelementptr instruction is often confusing. For some more insight
into how it works, see :doc:`the getelementptr FAQ <GetElementPtr>`.
Example:
""""""""
.. code-block:: llvm
; yields [12 x i8]*:aptr
%aptr = getelementptr {i32, [12 x i8]}, {i32, [12 x i8]}* %saptr, i64 0, i32 1
; yields i8*:vptr
%vptr = getelementptr {i32, <2 x i8>}, {i32, <2 x i8>}* %svptr, i64 0, i32 1, i32 1
; yields i8*:eptr
%eptr = getelementptr [12 x i8], [12 x i8]* %aptr, i64 0, i32 1
; yields i32*:iptr
%iptr = getelementptr [10 x i32], [10 x i32]* @arr, i16 0, i16 0
Vector of pointers:
"""""""""""""""""""
The ``getelementptr`` instruction returns a vector of pointers, instead of a single address,
when one or more of its arguments is a vector. In such cases, all vector
arguments should have the same number of elements, and every scalar argument
will be effectively broadcast into a vector during address calculation.
.. code-block:: llvm
; All arguments are vectors:
; A[i] = ptrs[i] + offsets[i]*sizeof(i8)
%A = getelementptr i8, <4 x i8*> %ptrs, <4 x i64> %offsets
; Add the same scalar offset to each pointer of a vector:
; A[i] = ptrs[i] + offset*sizeof(i8)
%A = getelementptr i8, <4 x i8*> %ptrs, i64 %offset
; Add distinct offsets to the same pointer:
; A[i] = ptr + offsets[i]*sizeof(i8)
%A = getelementptr i8, i8* %ptr, <4 x i64> %offsets
; In all cases described above the type of the result is <4 x i8*>
The following two instructions are equivalent:
.. code-block:: llvm
getelementptr %struct.ST, <4 x %struct.ST*> %s, <4 x i64> %ind1,
<4 x i32> <i32 2, i32 2, i32 2, i32 2>,
<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
<4 x i32> %ind4,
<4 x i64> <i64 13, i64 13, i64 13, i64 13>
getelementptr %struct.ST, <4 x %struct.ST*> %s, <4 x i64> %ind1,
i32 2, i32 1, <4 x i32> %ind4, i64 13
Let's look at the C code, where the vector version of ``getelementptr``
makes sense:
.. code-block:: c
// Let's assume that we vectorize the following loop:
double *A, *B; int *C;
for (int i = 0; i < size; ++i) {
A[i] = B[C[i]];
}
.. code-block:: llvm
; get pointers for 8 elements from array B
%ptrs = getelementptr double, double* %B, <8 x i32> %C
; load 8 elements from array B into A
%A = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs,
i32 8, <8 x i1> %mask, <8 x double> %passthru)
Conversion Operations
---------------------
The instructions in this category are the conversion instructions
(casting) which all take a single operand and a type. They perform
various bit conversions on the operand.
'``trunc .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = trunc <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``trunc``' instruction truncates its operand to the type ``ty2``.
Arguments:
""""""""""
The '``trunc``' instruction takes a value to trunc, and a type to trunc
it to. Both types must be of :ref:`integer <t_integer>` types, or vectors
of the same number of integers. The bit size of the ``value`` must be
larger than the bit size of the destination type, ``ty2``. Equal sized
types are not allowed.
Semantics:
""""""""""
The '``trunc``' instruction truncates the high order bits in ``value``
and converts the remaining bits to ``ty2``. Since the source size must
be larger than the destination size, ``trunc`` cannot be a *no-op cast*.
It will always truncate bits.
Example:
""""""""
.. code-block:: llvm
%X = trunc i32 257 to i8 ; yields i8:1
%Y = trunc i32 123 to i1 ; yields i1:true
%Z = trunc i32 122 to i1 ; yields i1:false
%W = trunc <2 x i16> <i16 8, i16 7> to <2 x i8> ; yields <i8 8, i8 7>
'``zext .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = zext <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``zext``' instruction zero extends its operand to type ``ty2``.
Arguments:
""""""""""
The '``zext``' instruction takes a value to cast, and a type to cast it
to. Both types must be of :ref:`integer <t_integer>` types, or vectors of
the same number of integers. The bit size of the ``value`` must be
smaller than the bit size of the destination type, ``ty2``.
Semantics:
""""""""""
The ``zext`` fills the high order bits of the ``value`` with zero bits
until it reaches the size of the destination type, ``ty2``.
When zero extending from i1, the result will always be either 0 or 1.
Example:
""""""""
.. code-block:: llvm
%X = zext i32 257 to i64 ; yields i64:257
%Y = zext i1 true to i32 ; yields i32:1
%Z = zext <2 x i16> <i16 8, i16 7> to <2 x i32> ; yields <i32 8, i32 7>
'``sext .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = sext <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``sext``' sign extends ``value`` to the type ``ty2``.
Arguments:
""""""""""
The '``sext``' instruction takes a value to cast, and a type to cast it
to. Both types must be of :ref:`integer <t_integer>` types, or vectors of
the same number of integers. The bit size of the ``value`` must be
smaller than the bit size of the destination type, ``ty2``.
Semantics:
""""""""""
The '``sext``' instruction performs a sign extension by copying the sign
bit (highest order bit) of the ``value`` until it reaches the bit size
of the type ``ty2``.
When sign extending from i1, the extension always results in -1 or 0.
Example:
""""""""
.. code-block:: llvm
%X = sext i8 -1 to i16 ; yields i16:65535
%Y = sext i1 true to i32 ; yields i32:-1
%Z = sext <2 x i16> <i16 8, i16 7> to <2 x i32> ; yields <i32 8, i32 7>
'``fptrunc .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fptrunc <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``fptrunc``' instruction truncates ``value`` to type ``ty2``.
Arguments:
""""""""""
The '``fptrunc``' instruction takes a :ref:`floating point <t_floating>`
value to cast and a :ref:`floating point <t_floating>` type to cast it to.
The size of ``value`` must be larger than the size of ``ty2``. This
implies that ``fptrunc`` cannot be used to make a *no-op cast*.
Semantics:
""""""""""
The '``fptrunc``' instruction casts a ``value`` from a larger
:ref:`floating point <t_floating>` type to a smaller :ref:`floating
point <t_floating>` type. If the value cannot fit (i.e. overflows) within the
destination type, ``ty2``, then the results are undefined. If the cast produces
an inexact result, how rounding is performed (e.g. truncation, also known as
round to zero) is undefined.
Example:
""""""""
.. code-block:: llvm
%X = fptrunc double 123.0 to float ; yields float:123.0
%Y = fptrunc double 1.0E+300 to float ; yields undefined
'``fpext .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fpext <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``fpext``' extends a floating point ``value`` to a larger floating
point value.
Arguments:
""""""""""
The '``fpext``' instruction takes a :ref:`floating point <t_floating>`
``value`` to cast, and a :ref:`floating point <t_floating>` type to cast it
to. The source type must be smaller than the destination type.
Semantics:
""""""""""
The '``fpext``' instruction extends the ``value`` from a smaller
:ref:`floating point <t_floating>` type to a larger :ref:`floating
point <t_floating>` type. The ``fpext`` cannot be used to make a
*no-op cast* because it always changes bits. Use ``bitcast`` to make a
*no-op cast* for a floating point cast.
Example:
""""""""
.. code-block:: llvm
%X = fpext float 3.125 to double ; yields double:3.125000e+00
%Y = fpext double %X to fp128 ; yields fp128:0xL00000000000000004000900000000000
'``fptoui .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fptoui <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``fptoui``' converts a floating point ``value`` to its unsigned
integer equivalent of type ``ty2``.
Arguments:
""""""""""
The '``fptoui``' instruction takes a value to cast, which must be a
scalar or vector :ref:`floating point <t_floating>` value, and a type to
cast it to ``ty2``, which must be an :ref:`integer <t_integer>` type. If
``ty`` is a vector floating point type, ``ty2`` must be a vector integer
type with the same number of elements as ``ty``.
Semantics:
""""""""""
The '``fptoui``' instruction converts its :ref:`floating
point <t_floating>` operand into the nearest (rounding towards zero)
unsigned integer value. If the value cannot fit in ``ty2``, the results
are undefined.
Example:
""""""""
.. code-block:: llvm
%X = fptoui double 123.0 to i32 ; yields i32:123
%Y = fptoui float 1.0E+300 to i1 ; yields undefined:1
%Z = fptoui float 1.04E+17 to i8 ; yields undefined:1
'``fptosi .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fptosi <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``fptosi``' instruction converts :ref:`floating point <t_floating>`
``value`` to type ``ty2``.
Arguments:
""""""""""
The '``fptosi``' instruction takes a value to cast, which must be a
scalar or vector :ref:`floating point <t_floating>` value, and a type to
cast it to ``ty2``, which must be an :ref:`integer <t_integer>` type. If
``ty`` is a vector floating point type, ``ty2`` must be a vector integer
type with the same number of elements as ``ty``.
Semantics:
""""""""""
The '``fptosi``' instruction converts its :ref:`floating
point <t_floating>` operand into the nearest (rounding towards zero)
signed integer value. If the value cannot fit in ``ty2``, the results
are undefined.
Example:
""""""""
.. code-block:: llvm
%X = fptosi double -123.0 to i32 ; yields i32:-123
%Y = fptosi float 1.0E-247 to i1 ; yields undefined:1
%Z = fptosi float 1.04E+17 to i8 ; yields undefined:1
'``uitofp .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = uitofp <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``uitofp``' instruction regards ``value`` as an unsigned integer
and converts that value to the ``ty2`` type.
Arguments:
""""""""""
The '``uitofp``' instruction takes a value to cast, which must be a
scalar or vector :ref:`integer <t_integer>` value, and a type to cast it to
``ty2``, which must be a :ref:`floating point <t_floating>` type. If
``ty`` is a vector integer type, ``ty2`` must be a vector floating point
type with the same number of elements as ``ty``.
Semantics:
""""""""""
The '``uitofp``' instruction interprets its operand as an unsigned
integer quantity and converts it to the corresponding floating point
value. If the value cannot fit in the floating point value, the results
are undefined.
Example:
""""""""
.. code-block:: llvm
%X = uitofp i32 257 to float ; yields float:257.0
%Y = uitofp i8 -1 to double ; yields double:255.0
'``sitofp .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = sitofp <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``sitofp``' instruction regards ``value`` as a signed integer and
converts that value to the ``ty2`` type.
Arguments:
""""""""""
The '``sitofp``' instruction takes a value to cast, which must be a
scalar or vector :ref:`integer <t_integer>` value, and a type to cast it to
``ty2``, which must be a :ref:`floating point <t_floating>` type. If
``ty`` is a vector integer type, ``ty2`` must be a vector floating point
type with the same number of elements as ``ty``.
Semantics:
""""""""""
The '``sitofp``' instruction interprets its operand as a signed integer
quantity and converts it to the corresponding floating point value. If
the value cannot fit in the floating point value, the results are
undefined.
Example:
""""""""
.. code-block:: llvm
%X = sitofp i32 257 to float ; yields float:257.0
%Y = sitofp i8 -1 to double ; yields double:-1.0
.. _i_ptrtoint:
'``ptrtoint .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = ptrtoint <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``ptrtoint``' instruction converts the pointer or a vector of
pointers ``value`` to the integer (or vector of integers) type ``ty2``.
Arguments:
""""""""""
The '``ptrtoint``' instruction takes a ``value`` to cast, which must be
a value of type :ref:`pointer <t_pointer>` or a vector of pointers, and a
type to cast it to ``ty2``, which must be an :ref:`integer <t_integer>` or
a vector of integers type.
Semantics:
""""""""""
The '``ptrtoint``' instruction converts ``value`` to integer type
``ty2`` by interpreting the pointer value as an integer and either
truncating or zero extending that value to the size of the integer type.
If ``value`` is smaller than ``ty2`` then a zero extension is done. If
``value`` is larger than ``ty2`` then a truncation is done. If they are
the same size, then nothing is done (*no-op cast*) other than a type
change.
Example:
""""""""
.. code-block:: llvm
%X = ptrtoint i32* %P to i8 ; yields truncation on 32-bit architecture
%Y = ptrtoint i32* %P to i64 ; yields zero extension on 32-bit architecture
%Z = ptrtoint <4 x i32*> %P to <4 x i64>; yields vector zero extension for a vector of addresses on 32-bit architecture
.. _i_inttoptr:
'``inttoptr .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = inttoptr <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``inttoptr``' instruction converts an integer ``value`` to a
pointer type, ``ty2``.
Arguments:
""""""""""
The '``inttoptr``' instruction takes an :ref:`integer <t_integer>` value to
cast, and a type to cast it to, which must be a :ref:`pointer <t_pointer>`
type.
Semantics:
""""""""""
The '``inttoptr``' instruction converts ``value`` to type ``ty2`` by
applying either a zero extension or a truncation depending on the size
of the integer ``value``. If ``value`` is larger than the size of a
pointer then a truncation is done. If ``value`` is smaller than the size
of a pointer then a zero extension is done. If they are the same size,
nothing is done (*no-op cast*).
Example:
""""""""
.. code-block:: llvm
%X = inttoptr i32 255 to i32* ; yields zero extension on 64-bit architecture
%Y = inttoptr i32 255 to i32* ; yields no-op on 32-bit architecture
%Z = inttoptr i64 0 to i32* ; yields truncation on 32-bit architecture
%Z = inttoptr <4 x i32> %G to <4 x i8*>; yields truncation of vector G to four pointers
.. _i_bitcast:
'``bitcast .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = bitcast <ty> <value> to <ty2> ; yields ty2
Overview:
"""""""""
The '``bitcast``' instruction converts ``value`` to type ``ty2`` without
changing any bits.
Arguments:
""""""""""
The '``bitcast``' instruction takes a value to cast, which must be a
non-aggregate first class value, and a type to cast it to, which must
also be a non-aggregate :ref:`first class <t_firstclass>` type. The
bit sizes of ``value`` and the destination type, ``ty2``, must be
identical. If the source type is a pointer, the destination type must
also be a pointer of the same size. This instruction supports bitwise
conversion of vectors to integers and to vectors of other types (as
long as they have the same size).
Semantics:
""""""""""
The '``bitcast``' instruction converts ``value`` to type ``ty2``. It
is always a *no-op cast* because no bits change with this
conversion. The conversion is done as if the ``value`` had been stored
to memory and read back as type ``ty2``. Pointer (or vector of
pointers) types may only be converted to other pointer (or vector of
pointers) types with the same address space through this instruction.
To convert pointers to other types, use the :ref:`inttoptr <i_inttoptr>`
or :ref:`ptrtoint <i_ptrtoint>` instructions first.
Example:
""""""""
.. code-block:: text
%X = bitcast i8 255 to i8 ; yields i8 :-1
%Y = bitcast i32* %x to sint* ; yields sint*:%x
%Z = bitcast <2 x int> %V to i64; ; yields i64: %V
%Z = bitcast <2 x i32*> %V to <2 x i64*> ; yields <2 x i64*>
.. _i_addrspacecast:
'``addrspacecast .. to``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = addrspacecast <pty> <ptrval> to <pty2> ; yields pty2
Overview:
"""""""""
The '``addrspacecast``' instruction converts ``ptrval`` from ``pty`` in
address space ``n`` to type ``pty2`` in address space ``m``.
Arguments:
""""""""""
The '``addrspacecast``' instruction takes a pointer or vector of pointer value
to cast and a pointer type to cast it to, which must have a different
address space.
Semantics:
""""""""""
The '``addrspacecast``' instruction converts the pointer value
``ptrval`` to type ``pty2``. It can be a *no-op cast* or a complex
value modification, depending on the target and the address space
pair. Pointer conversions within the same address space must be
performed with the ``bitcast`` instruction. Note that if the address space
conversion is legal then both result and operand refer to the same memory
location.
Example:
""""""""
.. code-block:: llvm
%X = addrspacecast i32* %x to i32 addrspace(1)* ; yields i32 addrspace(1)*:%x
%Y = addrspacecast i32 addrspace(1)* %y to i64 addrspace(2)* ; yields i64 addrspace(2)*:%y
%Z = addrspacecast <4 x i32*> %z to <4 x float addrspace(3)*> ; yields <4 x float addrspace(3)*>:%z
.. _otherops:
Other Operations
----------------
The instructions in this category are the "miscellaneous" instructions,
which defy better classification.
.. _i_icmp:
'``icmp``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = icmp <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
Overview:
"""""""""
The '``icmp``' instruction returns a boolean value or a vector of
boolean values based on comparison of its two integer, integer vector,
pointer, or pointer vector operands.
Arguments:
""""""""""
The '``icmp``' instruction takes three operands. The first operand is
the condition code indicating the kind of comparison to perform. It is
not a value, just a keyword. The possible condition codes are:
#. ``eq``: equal
#. ``ne``: not equal
#. ``ugt``: unsigned greater than
#. ``uge``: unsigned greater or equal
#. ``ult``: unsigned less than
#. ``ule``: unsigned less or equal
#. ``sgt``: signed greater than
#. ``sge``: signed greater or equal
#. ``slt``: signed less than
#. ``sle``: signed less or equal
The remaining two arguments must be :ref:`integer <t_integer>` or
:ref:`pointer <t_pointer>` or integer :ref:`vector <t_vector>` typed. They
must also be identical types.
Semantics:
""""""""""
The '``icmp``' compares ``op1`` and ``op2`` according to the condition
code given as ``cond``. The comparison performed always yields either an
:ref:`i1 <t_integer>` or vector of ``i1`` result, as follows:
#. ``eq``: yields ``true`` if the operands are equal, ``false``
otherwise. No sign interpretation is necessary or performed.
#. ``ne``: yields ``true`` if the operands are unequal, ``false``
otherwise. No sign interpretation is necessary or performed.
#. ``ugt``: interprets the operands as unsigned values and yields
``true`` if ``op1`` is greater than ``op2``.
#. ``uge``: interprets the operands as unsigned values and yields
``true`` if ``op1`` is greater than or equal to ``op2``.
#. ``ult``: interprets the operands as unsigned values and yields
``true`` if ``op1`` is less than ``op2``.
#. ``ule``: interprets the operands as unsigned values and yields
``true`` if ``op1`` is less than or equal to ``op2``.
#. ``sgt``: interprets the operands as signed values and yields ``true``
if ``op1`` is greater than ``op2``.
#. ``sge``: interprets the operands as signed values and yields ``true``
if ``op1`` is greater than or equal to ``op2``.
#. ``slt``: interprets the operands as signed values and yields ``true``
if ``op1`` is less than ``op2``.
#. ``sle``: interprets the operands as signed values and yields ``true``
if ``op1`` is less than or equal to ``op2``.
If the operands are :ref:`pointer <t_pointer>` typed, the pointer values
are compared as if they were integers.
If the operands are integer vectors, then they are compared element by
element. The result is an ``i1`` vector with the same number of elements
as the values being compared. Otherwise, the result is an ``i1``.
Example:
""""""""
.. code-block:: text
<result> = icmp eq i32 4, 5 ; yields: result=false
<result> = icmp ne float* %X, %X ; yields: result=false
<result> = icmp ult i16 4, 5 ; yields: result=true
<result> = icmp sgt i16 4, 5 ; yields: result=false
<result> = icmp ule i16 -4, 5 ; yields: result=false
<result> = icmp sge i16 4, 5 ; yields: result=false
.. _i_fcmp:
'``fcmp``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = fcmp [fast-math flags]* <cond> <ty> <op1>, <op2> ; yields i1 or <N x i1>:result
Overview:
"""""""""
The '``fcmp``' instruction returns a boolean value or vector of boolean
values based on comparison of its operands.
If the operands are floating point scalars, then the result type is a
boolean (:ref:`i1 <t_integer>`).
If the operands are floating point vectors, then the result type is a
vector of boolean with the same number of elements as the operands being
compared.
Arguments:
""""""""""
The '``fcmp``' instruction takes three operands. The first operand is
the condition code indicating the kind of comparison to perform. It is
not a value, just a keyword. The possible condition codes are:
#. ``false``: no comparison, always returns false
#. ``oeq``: ordered and equal
#. ``ogt``: ordered and greater than
#. ``oge``: ordered and greater than or equal
#. ``olt``: ordered and less than
#. ``ole``: ordered and less than or equal
#. ``one``: ordered and not equal
#. ``ord``: ordered (no nans)
#. ``ueq``: unordered or equal
#. ``ugt``: unordered or greater than
#. ``uge``: unordered or greater than or equal
#. ``ult``: unordered or less than
#. ``ule``: unordered or less than or equal
#. ``une``: unordered or not equal
#. ``uno``: unordered (either nans)
#. ``true``: no comparison, always returns true
*Ordered* means that neither operand is a QNAN while *unordered* means
that either operand may be a QNAN.
Each of ``val1`` and ``val2`` arguments must be either a :ref:`floating
point <t_floating>` type or a :ref:`vector <t_vector>` of floating point
type. They must have identical types.
Semantics:
""""""""""
The '``fcmp``' instruction compares ``op1`` and ``op2`` according to the
condition code given as ``cond``. If the operands are vectors, then the
vectors are compared element by element. Each comparison performed
always yields an :ref:`i1 <t_integer>` result, as follows:
#. ``false``: always yields ``false``, regardless of operands.
#. ``oeq``: yields ``true`` if both operands are not a QNAN and ``op1``
is equal to ``op2``.
#. ``ogt``: yields ``true`` if both operands are not a QNAN and ``op1``
is greater than ``op2``.
#. ``oge``: yields ``true`` if both operands are not a QNAN and ``op1``
is greater than or equal to ``op2``.
#. ``olt``: yields ``true`` if both operands are not a QNAN and ``op1``
is less than ``op2``.
#. ``ole``: yields ``true`` if both operands are not a QNAN and ``op1``
is less than or equal to ``op2``.
#. ``one``: yields ``true`` if both operands are not a QNAN and ``op1``
is not equal to ``op2``.
#. ``ord``: yields ``true`` if both operands are not a QNAN.
#. ``ueq``: yields ``true`` if either operand is a QNAN or ``op1`` is
equal to ``op2``.
#. ``ugt``: yields ``true`` if either operand is a QNAN or ``op1`` is
greater than ``op2``.
#. ``uge``: yields ``true`` if either operand is a QNAN or ``op1`` is
greater than or equal to ``op2``.
#. ``ult``: yields ``true`` if either operand is a QNAN or ``op1`` is
less than ``op2``.
#. ``ule``: yields ``true`` if either operand is a QNAN or ``op1`` is
less than or equal to ``op2``.
#. ``une``: yields ``true`` if either operand is a QNAN or ``op1`` is
not equal to ``op2``.
#. ``uno``: yields ``true`` if either operand is a QNAN.
#. ``true``: always yields ``true``, regardless of operands.
The ``fcmp`` instruction can also optionally take any number of
:ref:`fast-math flags <fastmath>`, which are optimization hints to enable
otherwise unsafe floating point optimizations.
Any set of fast-math flags is legal on an ``fcmp`` instruction, but the
only flags that have any effect on its semantics are those that allow
assumptions to be made about the values of input arguments; namely
``nnan``, ``ninf``, and ``nsz``. See :ref:`fastmath` for more information.
Example:
""""""""
.. code-block:: text
<result> = fcmp oeq float 4.0, 5.0 ; yields: result=false
<result> = fcmp one float 4.0, 5.0 ; yields: result=true
<result> = fcmp olt float 4.0, 5.0 ; yields: result=true
<result> = fcmp ueq double 1.0, 2.0 ; yields: result=false
.. _i_phi:
'``phi``' Instruction
^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = phi <ty> [ <val0>, <label0>], ...
Overview:
"""""""""
The '``phi``' instruction is used to implement the φ node in the SSA
graph representing the function.
Arguments:
""""""""""
The type of the incoming values is specified with the first type field.
After this, the '``phi``' instruction takes a list of pairs as
arguments, with one pair for each predecessor basic block of the current
block. Only values of :ref:`first class <t_firstclass>` type may be used as
the value arguments to the PHI node. Only labels may be used as the
label arguments.
There must be no non-phi instructions between the start of a basic block
and the PHI instructions: i.e. PHI instructions must be first in a basic
block.
For the purposes of the SSA form, the use of each incoming value is
deemed to occur on the edge from the corresponding predecessor block to
the current block (but after any definition of an '``invoke``'
instruction's return value on the same edge).
Semantics:
""""""""""
At runtime, the '``phi``' instruction logically takes on the value
specified by the pair corresponding to the predecessor basic block that
executed just prior to the current block.
Example:
""""""""
.. code-block:: llvm
Loop: ; Infinite loop that counts from 0 on up...
%indvar = phi i32 [ 0, %LoopHeader ], [ %nextindvar, %Loop ]
%nextindvar = add i32 %indvar, 1
br label %Loop
.. _i_select:
'``select``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = select selty <cond>, <ty> <val1>, <ty> <val2> ; yields ty
selty is either i1 or {<N x i1>}
Overview:
"""""""""
The '``select``' instruction is used to choose one value based on a
condition, without IR-level branching.
Arguments:
""""""""""
The '``select``' instruction requires an 'i1' value or a vector of 'i1'
values indicating the condition, and two values of the same :ref:`first
class <t_firstclass>` type.
Semantics:
""""""""""
If the condition is an i1 and it evaluates to 1, the instruction returns
the first value argument; otherwise, it returns the second value
argument.
If the condition is a vector of i1, then the value arguments must be
vectors of the same size, and the selection is done element by element.
If the condition is an i1 and the value arguments are vectors of the
same size, then an entire vector is selected.
Example:
""""""""
.. code-block:: llvm
%X = select i1 true, i8 17, i8 42 ; yields i8:17
.. _i_call:
'``call``' Instruction
^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<result> = [tail | musttail | notail ] call [fast-math flags] [cconv] [ret attrs] <ty>|<fnty> <fnptrval>(<function args>) [fn attrs]
[ operand bundles ]
Overview:
"""""""""
The '``call``' instruction represents a simple function call.
Arguments:
""""""""""
This instruction requires several arguments:
#. The optional ``tail`` and ``musttail`` markers indicate that the optimizers
should perform tail call optimization. The ``tail`` marker is a hint that
`can be ignored <CodeGenerator.html#sibcallopt>`_. The ``musttail`` marker
means that the call must be tail call optimized in order for the program to
be correct. The ``musttail`` marker provides these guarantees:
#. The call will not cause unbounded stack growth if it is part of a
recursive cycle in the call graph.
#. Arguments with the :ref:`inalloca <attr_inalloca>` attribute are
forwarded in place.
Both markers imply that the callee does not access allocas or varargs from
the caller. Calls marked ``musttail`` must obey the following additional
rules:
- The call must immediately precede a :ref:`ret <i_ret>` instruction,
or a pointer bitcast followed by a ret instruction.
- The ret instruction must return the (possibly bitcasted) value
produced by the call or void.
- The caller and callee prototypes must match. Pointer types of
parameters or return types may differ in pointee type, but not
in address space.
- The calling conventions of the caller and callee must match.
- All ABI-impacting function attributes, such as sret, byval, inreg,
returned, and inalloca, must match.
- The callee must be varargs iff the caller is varargs. Bitcasting a
non-varargs function to the appropriate varargs type is legal so
long as the non-varargs prefixes obey the other rules.
Tail call optimization for calls marked ``tail`` is guaranteed to occur if
the following conditions are met:
- Caller and callee both have the calling convention ``fastcc``.
- The call is in tail position (ret immediately follows call and ret
uses value of call or is void).
- Option ``-tailcallopt`` is enabled, or
``llvm::GuaranteedTailCallOpt`` is ``true``.
- `Platform-specific constraints are
met. <CodeGenerator.html#tailcallopt>`_
#. The optional ``notail`` marker indicates that the optimizers should not add
``tail`` or ``musttail`` markers to the call. It is used to prevent tail
call optimization from being performed on the call.
#. The optional ``fast-math flags`` marker indicates that the call has one or more
:ref:`fast-math flags <fastmath>`, which are optimization hints to enable
otherwise unsafe floating-point optimizations. Fast-math flags are only valid
for calls that return a floating-point scalar or vector type.
#. The optional "cconv" marker indicates which :ref:`calling
convention <callingconv>` the call should use. If none is
specified, the call defaults to using C calling conventions. The
calling convention of the call must match the calling convention of
the target function, or else the behavior is undefined.
#. The optional :ref:`Parameter Attributes <paramattrs>` list for return
values. Only '``zeroext``', '``signext``', and '``inreg``' attributes
are valid here.
#. '``ty``': the type of the call instruction itself which is also the
type of the return value. Functions that return no value are marked
``void``.
#. '``fnty``': shall be the signature of the function being called. The
argument types must match the types implied by this signature. This
type can be omitted if the function is not varargs.
#. '``fnptrval``': An LLVM value containing a pointer to a function to
be called. In most cases, this is a direct function call, but
indirect ``call``'s are just as possible, calling an arbitrary pointer
to function value.
#. '``function args``': argument list whose types match the function
signature argument types and parameter attributes. All arguments must
be of :ref:`first class <t_firstclass>` type. If the function signature
indicates the function accepts a variable number of arguments, the
extra arguments can be specified.
#. The optional :ref:`function attributes <fnattrs>` list.
#. The optional :ref:`operand bundles <opbundles>` list.
Semantics:
""""""""""
The '``call``' instruction is used to cause control flow to transfer to
a specified function, with its incoming arguments bound to the specified
values. Upon a '``ret``' instruction in the called function, control
flow continues with the instruction after the function call, and the
return value of the function is bound to the result argument.
Example:
""""""""
.. code-block:: llvm
%retval = call i32 @test(i32 %argc)
call i32 (i8*, ...)* @printf(i8* %msg, i32 12, i8 42) ; yields i32
%X = tail call i32 @foo() ; yields i32
%Y = tail call fastcc i32 @foo() ; yields i32
call void %foo(i8 97 signext)
%struct.A = type { i32, i8 }
%r = call %struct.A @foo() ; yields { i32, i8 }
%gr = extractvalue %struct.A %r, 0 ; yields i32
%gr1 = extractvalue %struct.A %r, 1 ; yields i8
%Z = call void @foo() noreturn ; indicates that %foo never returns normally
%ZZ = call zeroext i32 @bar() ; Return value is zero extended
LLVM treats calls to some functions with names and arguments that match
the standard C99 library as being the C99 library functions, and may
perform optimizations or generate code for them under that assumption.
This is something we'd like to change in the future to provide better
support for freestanding environments and non-C-based languages.
.. _i_va_arg:
'``va_arg``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<resultval> = va_arg <va_list*> <arglist>, <argty>
Overview:
"""""""""
The '``va_arg``' instruction is used to access arguments passed through
the "variable argument" area of a function call. It is used to implement
the ``va_arg`` macro in C.
Arguments:
""""""""""
This instruction takes a ``va_list*`` value and the type of the
argument. It returns a value of the specified argument type and
increments the ``va_list`` to point to the next argument. The actual
type of ``va_list`` is target specific.
Semantics:
""""""""""
The '``va_arg``' instruction loads an argument of the specified type
from the specified ``va_list`` and causes the ``va_list`` to point to
the next argument. For more information, see the variable argument
handling :ref:`Intrinsic Functions <int_varargs>`.
It is legal for this instruction to be called in a function which does
not take a variable number of arguments, for example, the ``vfprintf``
function.
``va_arg`` is an LLVM instruction instead of an :ref:`intrinsic
function <intrinsics>` because it takes a type as an argument.
Example:
""""""""
See the :ref:`variable argument processing <int_varargs>` section.
Note that the code generator does not yet fully support va\_arg on many
targets. Also, it does not currently support va\_arg with aggregate
types on any target.
.. _i_landingpad:
'``landingpad``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<resultval> = landingpad <resultty> <clause>+
<resultval> = landingpad <resultty> cleanup <clause>*
<clause> := catch <type> <value>
<clause> := filter <array constant type> <array constant>
Overview:
"""""""""
The '``landingpad``' instruction is used by `LLVM's exception handling
system <ExceptionHandling.html#overview>`_ to specify that a basic block
is a landing pad --- one where the exception lands, and corresponds to the
code found in the ``catch`` portion of a ``try``/``catch`` sequence. It
defines values supplied by the :ref:`personality function <personalityfn>` upon
re-entry to the function. The ``resultval`` has the type ``resultty``.
Arguments:
""""""""""
The optional
``cleanup`` flag indicates that the landing pad block is a cleanup.
A ``clause`` begins with the clause type --- ``catch`` or ``filter`` --- and
contains the global variable representing the "type" that may be caught
or filtered respectively. Unlike the ``catch`` clause, the ``filter``
clause takes an array constant as its argument. Use
"``[0 x i8**] undef``" for a filter which cannot throw. The
'``landingpad``' instruction must contain *at least* one ``clause`` or
the ``cleanup`` flag.
Semantics:
""""""""""
The '``landingpad``' instruction defines the values which are set by the
:ref:`personality function <personalityfn>` upon re-entry to the function, and
therefore the "result type" of the ``landingpad`` instruction. As with
calling conventions, how the personality function results are
represented in LLVM IR is target specific.
The clauses are applied in order from top to bottom. If two
``landingpad`` instructions are merged together through inlining, the
clauses from the calling function are appended to the list of clauses.
When the call stack is being unwound due to an exception being thrown,
the exception is compared against each ``clause`` in turn. If it doesn't
match any of the clauses, and the ``cleanup`` flag is not set, then
unwinding continues further up the call stack.
The ``landingpad`` instruction has several restrictions:
- A landing pad block is a basic block which is the unwind destination
of an '``invoke``' instruction.
- A landing pad block must have a '``landingpad``' instruction as its
first non-PHI instruction.
- There can be only one '``landingpad``' instruction within the landing
pad block.
- A basic block that is not a landing pad block may not include a
'``landingpad``' instruction.
Example:
""""""""
.. code-block:: llvm
;; A landing pad which can catch an integer.
%res = landingpad { i8*, i32 }
catch i8** @_ZTIi
;; A landing pad that is a cleanup.
%res = landingpad { i8*, i32 }
cleanup
;; A landing pad which can catch an integer and can only throw a double.
%res = landingpad { i8*, i32 }
catch i8** @_ZTIi
filter [1 x i8**] [@_ZTId]
.. _i_catchpad:
'``catchpad``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<resultval> = catchpad within <catchswitch> [<args>*]
Overview:
"""""""""
The '``catchpad``' instruction is used by `LLVM's exception handling
system <ExceptionHandling.html#overview>`_ to specify that a basic block
begins a catch handler --- one where a personality routine attempts to transfer
control to catch an exception.
Arguments:
""""""""""
The ``catchswitch`` operand must always be a token produced by a
:ref:`catchswitch <i_catchswitch>` instruction in a predecessor block. This
ensures that each ``catchpad`` has exactly one predecessor block, and it always
terminates in a ``catchswitch``.
The ``args`` correspond to whatever information the personality routine
requires to know if this is an appropriate handler for the exception. Control
will transfer to the ``catchpad`` if this is the first appropriate handler for
the exception.
The ``resultval`` has the type :ref:`token <t_token>` and is used to match the
``catchpad`` to corresponding :ref:`catchrets <i_catchret>` and other nested EH
pads.
Semantics:
""""""""""
When the call stack is being unwound due to an exception being thrown, the
exception is compared against the ``args``. If it doesn't match, control will
not reach the ``catchpad`` instruction. The representation of ``args`` is
entirely target and personality function-specific.
Like the :ref:`landingpad <i_landingpad>` instruction, the ``catchpad``
instruction must be the first non-phi of its parent basic block.
The meaning of the tokens produced and consumed by ``catchpad`` and other "pad"
instructions is described in the
`Windows exception handling documentation\ <ExceptionHandling.html#wineh>`_.
When a ``catchpad`` has been "entered" but not yet "exited" (as
described in the `EH documentation\ <ExceptionHandling.html#wineh-constraints>`_),
it is undefined behavior to execute a :ref:`call <i_call>` or :ref:`invoke <i_invoke>`
that does not carry an appropriate :ref:`"funclet" bundle <ob_funclet>`.
Example:
""""""""
.. code-block:: text
dispatch:
%cs = catchswitch within none [label %handler0] unwind to caller
;; A catch block which can catch an integer.
handler0:
%tok = catchpad within %cs [i8** @_ZTIi]
.. _i_cleanuppad:
'``cleanuppad``' Instruction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
<resultval> = cleanuppad within <parent> [<args>*]
Overview:
"""""""""
The '``cleanuppad``' instruction is used by `LLVM's exception handling
system <ExceptionHandling.html#overview>`_ to specify that a basic block
is a cleanup block --- one where a personality routine attempts to
transfer control to run cleanup actions.
The ``args`` correspond to whatever additional
information the :ref:`personality function <personalityfn>` requires to
execute the cleanup.
The ``resultval`` has the type :ref:`token <t_token>` and is used to
match the ``cleanuppad`` to corresponding :ref:`cleanuprets <i_cleanupret>`.
The ``parent`` argument is the token of the funclet that contains the
``cleanuppad`` instruction. If the ``cleanuppad`` is not inside a funclet,
this operand may be the token ``none``.
Arguments:
""""""""""
The instruction takes a list of arbitrary values which are interpreted
by the :ref:`personality function <personalityfn>`.
Semantics:
""""""""""
When the call stack is being unwound due to an exception being thrown,
the :ref:`personality function <personalityfn>` transfers control to the
``cleanuppad`` with the aid of the personality-specific arguments.
As with calling conventions, how the personality function results are
represented in LLVM IR is target specific.
The ``cleanuppad`` instruction has several restrictions:
- A cleanup block is a basic block which is the unwind destination of
an exceptional instruction.
- A cleanup block must have a '``cleanuppad``' instruction as its
first non-PHI instruction.
- There can be only one '``cleanuppad``' instruction within the
cleanup block.
- A basic block that is not a cleanup block may not include a
'``cleanuppad``' instruction.
When a ``cleanuppad`` has been "entered" but not yet "exited" (as
described in the `EH documentation\ <ExceptionHandling.html#wineh-constraints>`_),
it is undefined behavior to execute a :ref:`call <i_call>` or :ref:`invoke <i_invoke>`
that does not carry an appropriate :ref:`"funclet" bundle <ob_funclet>`.
Example:
""""""""
.. code-block:: text
%tok = cleanuppad within %cs []
.. _intrinsics:
Intrinsic Functions
===================
LLVM supports the notion of an "intrinsic function". These functions
have well known names and semantics and are required to follow certain
restrictions. Overall, these intrinsics represent an extension mechanism
for the LLVM language that does not require changing all of the
transformations in LLVM when adding to the language (or the bitcode
reader/writer, the parser, etc...).
Intrinsic function names must all start with an "``llvm.``" prefix. This
prefix is reserved in LLVM for intrinsic names; thus, function names may
not begin with this prefix. Intrinsic functions must always be external
functions: you cannot define the body of intrinsic functions. Intrinsic
functions may only be used in call or invoke instructions: it is illegal
to take the address of an intrinsic function. Additionally, because
intrinsic functions are part of the LLVM language, it is required if any
are added that they be documented here.
Some intrinsic functions can be overloaded, i.e., the intrinsic
represents a family of functions that perform the same operation but on
different data types. Because LLVM can represent over 8 million
different integer types, overloading is used commonly to allow an
intrinsic function to operate on any integer type. One or more of the
argument types or the result type can be overloaded to accept any
integer type. Argument types may also be defined as exactly matching a
previous argument's type or the result type. This allows an intrinsic
function which accepts multiple arguments, but needs all of them to be
of the same type, to only be overloaded with respect to a single
argument or the result.
Overloaded intrinsics will have the names of their overloaded argument
types encoded into their function names, each preceded by a period. Only
those types which are overloaded result in a name suffix. Arguments
whose type is matched against another type do not. For example, the
``llvm.ctpop`` function can take an integer of any width and returns an
integer of exactly the same integer width. This leads to a family of
functions such as ``i8 @llvm.ctpop.i8(i8 %val)`` and
``i29 @llvm.ctpop.i29(i29 %val)``. Only one type, the return type, is
overloaded, and only one type suffix is required. Because the argument's
type is matched against the return type, it does not require its own
name suffix.
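For illustration, a few members of this family might be declared as in the
following sketch; the ``v2i32`` variant is included only as an assumed example
of the same naming scheme applied to a vector type.
.. code-block:: llvm
      declare i8 @llvm.ctpop.i8(i8 %val)                    ; overloaded on i8
      declare i29 @llvm.ctpop.i29(i29 %val)                 ; overloaded on i29
      declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val)   ; vector form, suffix v2i32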
To learn how to add an intrinsic function, please see the `Extending
LLVM Guide <ExtendingLLVM.html>`_.
.. _int_varargs:
Variable Argument Handling Intrinsics
-------------------------------------
Variable argument support is defined in LLVM with the
:ref:`va_arg <i_va_arg>` instruction and these three intrinsic
functions. These functions are related to the similarly named macros
defined in the ``<stdarg.h>`` header file.
All of these functions operate on arguments that use a target-specific
value type "``va_list``". The LLVM assembly language reference manual
does not define what this type is, so all transformations should be
prepared to handle these functions regardless of the type used.
This example shows how the :ref:`va_arg <i_va_arg>` instruction and the
variable argument handling intrinsic functions are used.
.. code-block:: llvm
; This struct is different for every platform. For most platforms,
; it is merely an i8*.
%struct.va_list = type { i8* }
; For Unix x86_64 platforms, va_list is the following struct:
; %struct.va_list = type { i32, i32, i8*, i8* }
define i32 @test(i32 %X, ...) {
; Initialize variable argument processing
%ap = alloca %struct.va_list
%ap2 = bitcast %struct.va_list* %ap to i8*
call void @llvm.va_start(i8* %ap2)
; Read a single integer argument
%tmp = va_arg i8* %ap2, i32
; Demonstrate usage of llvm.va_copy and llvm.va_end
%aq = alloca i8*
%aq2 = bitcast i8** %aq to i8*
call void @llvm.va_copy(i8* %aq2, i8* %ap2)
call void @llvm.va_end(i8* %aq2)
; Stop processing of arguments.
call void @llvm.va_end(i8* %ap2)
ret i32 %tmp
}
declare void @llvm.va_start(i8*)
declare void @llvm.va_copy(i8*, i8*)
declare void @llvm.va_end(i8*)
.. _int_va_start:
'``llvm.va_start``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.va_start(i8* <arglist>)
Overview:
"""""""""
The '``llvm.va_start``' intrinsic initializes ``*<arglist>`` for
subsequent use by ``va_arg``.
Arguments:
""""""""""
The argument is a pointer to a ``va_list`` element to initialize.
Semantics:
""""""""""
The '``llvm.va_start``' intrinsic works just like the ``va_start`` macro
available in C. In a target-dependent way, it initializes the
``va_list`` element to which the argument points, so that the next call
to ``va_arg`` will produce the first variable argument passed to the
function. Unlike the C ``va_start`` macro, this intrinsic does not need
to know the last argument of the function as the compiler can figure
that out.
'``llvm.va_end``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.va_end(i8* <arglist>)
Overview:
"""""""""
The '``llvm.va_end``' intrinsic destroys ``*<arglist>``, which has been
initialized previously with ``llvm.va_start`` or ``llvm.va_copy``.
Arguments:
""""""""""
The argument is a pointer to a ``va_list`` to destroy.
Semantics:
""""""""""
The '``llvm.va_end``' intrinsic works just like the ``va_end`` macro
available in C. In a target-dependent way, it destroys the ``va_list``
element to which the argument points. Calls to
:ref:`llvm.va_start <int_va_start>` and
:ref:`llvm.va_copy <int_va_copy>` must be matched exactly with calls to
``llvm.va_end``.
.. _int_va_copy:
'``llvm.va_copy``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.va_copy(i8* <destarglist>, i8* <srcarglist>)
Overview:
"""""""""
The '``llvm.va_copy``' intrinsic copies the current argument position
from the source argument list to the destination argument list.
Arguments:
""""""""""
The first argument is a pointer to a ``va_list`` element to initialize.
The second argument is a pointer to a ``va_list`` element to copy from.
Semantics:
""""""""""
The '``llvm.va_copy``' intrinsic works just like the ``va_copy`` macro
available in C. In a target-dependent way, it copies the source
``va_list`` element into the destination ``va_list`` element. This
intrinsic is necessary because the ``llvm.va_start`` intrinsic may be
arbitrarily complex and require, for example, memory allocation.
Accurate Garbage Collection Intrinsics
--------------------------------------
LLVM's support for `Accurate Garbage Collection <GarbageCollection.html>`_
(GC) requires the frontend to generate code containing appropriate intrinsic
calls and select an appropriate GC strategy which knows how to lower these
intrinsics in a manner which is appropriate for the target collector.
These intrinsics allow identification of :ref:`GC roots on the
stack <int_gcroot>`, as well as garbage collector implementations that
require :ref:`read <int_gcread>` and :ref:`write <int_gcwrite>` barriers.
Frontends for type-safe garbage collected languages should generate
these intrinsics to make use of the LLVM garbage collectors. For more
details, see `Garbage Collection with LLVM <GarbageCollection.html>`_.
Experimental Statepoint Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
LLVM provides a second experimental set of intrinsics for describing garbage
collection safepoints in compiled code. These intrinsics are an alternative
to the ``llvm.gcroot`` intrinsics, but are compatible with the ones for
:ref:`read <int_gcread>` and :ref:`write <int_gcwrite>` barriers. The
differences in approach are covered in the `Garbage Collection with LLVM
<GarbageCollection.html>`_ documentation. The intrinsics themselves are
described in :doc:`Statepoints`.
.. _int_gcroot:
'``llvm.gcroot``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.gcroot(i8** %ptrloc, i8* %metadata)
Overview:
"""""""""
The '``llvm.gcroot``' intrinsic declares the existence of a GC root to
the code generator, and allows some metadata to be associated with it.
Arguments:
""""""""""
The first argument specifies the address of a stack object that contains
the root pointer. The second pointer (which must be either a constant or
a global value address) contains the meta-data to be associated with the
root.
Semantics:
""""""""""
At runtime, a call to this intrinsic stores a null pointer into the
"ptrloc" location. At compile-time, the code generator generates
information to allow the runtime to find the pointer at GC safe points.
The '``llvm.gcroot``' intrinsic may only be used in a function which
:ref:`specifies a GC algorithm <gc>`.
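As a hedged sketch (assuming the built-in ``shadow-stack`` collector as the
chosen GC strategy), a function registering a single root might look like:
.. code-block:: llvm
      declare void @llvm.gcroot(i8**, i8*)
      define void @frob() gc "shadow-stack" {
      entry:
        %root = alloca i8*                             ; stack slot that holds the root pointer
        call void @llvm.gcroot(i8** %root, i8* null)   ; declare the slot to the collector, no metadata
        ; ... initialize and use the root here ...
        ret void
      }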
.. _int_gcread:
'``llvm.gcread``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.gcread(i8* %ObjPtr, i8** %Ptr)
Overview:
"""""""""
The '``llvm.gcread``' intrinsic identifies reads of references from heap
locations, allowing garbage collector implementations that require read
barriers.
Arguments:
""""""""""
The second argument is the address to read from, which should be an
address allocated from the garbage collector. The first argument is a
pointer to the start of the referenced object, if needed by the language
runtime (otherwise null).
Semantics:
""""""""""
The '``llvm.gcread``' intrinsic has the same semantics as a load
instruction, but may be replaced with substantially more complex code by
the garbage collector runtime, as needed. The '``llvm.gcread``'
intrinsic may only be used in a function which :ref:`specifies a GC
algorithm <gc>`.
.. _int_gcwrite:
'``llvm.gcwrite``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.gcwrite(i8* %P1, i8* %Obj, i8** %P2)
Overview:
"""""""""
The '``llvm.gcwrite``' intrinsic identifies writes of references to heap
locations, allowing garbage collector implementations that require write
barriers (such as generational or reference counting collectors).
Arguments:
""""""""""
The first argument is the reference to store, the second is the start of
the object to store it to, and the third is the address of the field of
Obj to store to. If the runtime does not require a pointer to the
object, Obj may be null.
Semantics:
""""""""""
The '``llvm.gcwrite``' intrinsic has the same semantics as a store
instruction, but may be replaced with substantially more complex code by
the garbage collector runtime, as needed. The '``llvm.gcwrite``'
intrinsic may only be used in a function which :ref:`specifies a GC
algorithm <gc>`.
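As a hedged sketch (the ``shadow-stack`` strategy and the object/field layout
are illustrative assumptions), a barriered field store might look like:
.. code-block:: llvm
      declare void @llvm.gcwrite(i8*, i8*, i8**)
      define void @set_field(i8* %obj, i8** %field, i8* %newval) gc "shadow-stack" {
      entry:
        ; store %newval into the field at %field of object %obj via the write barrier
        call void @llvm.gcwrite(i8* %newval, i8* %obj, i8** %field)
        ret void
      }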
Code Generator Intrinsics
-------------------------
These intrinsics are provided by LLVM to expose special features that
may only be implemented with code generator support.
'``llvm.returnaddress``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.returnaddress(i32 <level>)
Overview:
"""""""""
The '``llvm.returnaddress``' intrinsic attempts to compute a
target-specific value indicating the return address of the current
function or one of its callers.
Arguments:
""""""""""
The argument to this intrinsic indicates which function to return the
address for. Zero indicates the calling function, one indicates its
caller, etc. The argument is **required** to be a constant integer
value.
Semantics:
""""""""""
The '``llvm.returnaddress``' intrinsic either returns a pointer
indicating the return address of the specified call frame, or zero if it
cannot be identified. The value returned by this intrinsic is likely to
be incorrect or 0 for arguments other than zero, so it should only be
used for debugging purposes.
Note that calling this intrinsic does not prevent function inlining or
other aggressive transformations, so the value returned may not be that
of the obvious source-language caller.
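A minimal sketch of the debugging use described above:
.. code-block:: llvm
      declare i8* @llvm.returnaddress(i32)
      define i8* @my_return_address() {
      entry:
        %ra = call i8* @llvm.returnaddress(i32 0)   ; 0 = return address of the current frame
        ret i8* %ra
      }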
'``llvm.addressofreturnaddress``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.addressofreturnaddress()
Overview:
"""""""""
The '``llvm.addressofreturnaddress``' intrinsic returns a target-specific
pointer to the place in the stack frame where the return address of the
current function is stored.
Semantics:
""""""""""
Note that calling this intrinsic does not prevent function inlining or
other aggressive transformations, so the value returned may not be that
of the obvious source-language caller.
This intrinsic is only implemented for x86.
'``llvm.frameaddress``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.frameaddress(i32 <level>)
Overview:
"""""""""
The '``llvm.frameaddress``' intrinsic attempts to return the
target-specific frame pointer value for the specified stack frame.
Arguments:
""""""""""
The argument to this intrinsic indicates which function to return the
frame pointer for. Zero indicates the calling function, one indicates
its caller, etc. The argument is **required** to be a constant integer
value.
Semantics:
""""""""""
The '``llvm.frameaddress``' intrinsic either returns a pointer
indicating the frame address of the specified call frame, or zero if it
cannot be identified. The value returned by this intrinsic is likely to
be incorrect or 0 for arguments other than zero, so it should only be
used for debugging purposes.
Note that calling this intrinsic does not prevent function inlining or
other aggressive transformations, so the value returned may not be that
of the obvious source-language caller.
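A minimal sketch:
.. code-block:: llvm
      declare i8* @llvm.frameaddress(i32)
      define i8* @my_frame_address() {
      entry:
        %fp = call i8* @llvm.frameaddress(i32 0)    ; 0 = frame pointer of the current frame
        ret i8* %fp
      }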
'``llvm.localescape``' and '``llvm.localrecover``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.localescape(...)
declare i8* @llvm.localrecover(i8* %func, i8* %fp, i32 %idx)
Overview:
"""""""""
The '``llvm.localescape``' intrinsic escapes offsets of a collection of static
allocas, and the '``llvm.localrecover``' intrinsic applies those offsets to a
live frame pointer to recover the address of the allocation. The offset is
computed during frame layout of the caller of ``llvm.localescape``.
Arguments:
""""""""""
All arguments to '``llvm.localescape``' must be pointers to static allocas or
casts of static allocas. Each function can only call '``llvm.localescape``'
once, and it can only do so from the entry block.
The ``func`` argument to '``llvm.localrecover``' must be a constant
bitcasted pointer to a function defined in the current module. The code
generator cannot determine the frame allocation offset of functions defined in
other modules.
The ``fp`` argument to '``llvm.localrecover``' must be a frame pointer of a
call frame that is currently live. The return value of '``llvm.localaddress``'
is one way to produce such a value, but various runtimes also expose a suitable
pointer in platform-specific ways.
The ``idx`` argument to '``llvm.localrecover``' indicates which alloca passed to
'``llvm.localescape``' to recover. It is zero-indexed.
Semantics:
""""""""""
These intrinsics allow a group of functions to share access to a set of local
stack allocations of a one parent function. The parent function may call the
'``llvm.localescape``' intrinsic once from the function entry block, and the
child functions can use '``llvm.localrecover``' to access the escaped allocas.
The '``llvm.localescape``' intrinsic blocks inlining, as inlining changes where
the escaped allocas are allocated, which would break attempts to use
'``llvm.localrecover``'.
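As a hedged sketch (how the child obtains the parent's frame pointer, here
simply an ``i8*`` argument, is a runtime-specific assumption):
.. code-block:: llvm
      declare void @llvm.localescape(...)
      declare i8* @llvm.localrecover(i8*, i8*, i32)
      define void @parent() {
      entry:
        %slot = alloca i32
        call void (...) @llvm.localescape(i32* %slot)     ; escape alloca #0
        ; ... run code that eventually invokes @child while this frame is live ...
        ret void
      }
      define void @child(i8* %parent_fp) {
      entry:
        ; recover the address of alloca #0 in @parent's frame
        %raw = call i8* @llvm.localrecover(i8* bitcast (void ()* @parent to i8*), i8* %parent_fp, i32 0)
        %slot = bitcast i8* %raw to i32*
        store i32 42, i32* %slot
        ret void
      }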
.. _int_read_register:
.. _int_write_register:
'``llvm.read_register``' and '``llvm.write_register``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.read_register.i32(metadata)
declare i64 @llvm.read_register.i64(metadata)
declare void @llvm.write_register.i32(metadata, i32 @value)
declare void @llvm.write_register.i64(metadata, i64 @value)
!0 = !{!"sp\00"}
Overview:
"""""""""
The '``llvm.read_register``' and '``llvm.write_register``' intrinsics
provide access to the named register. The register must be valid on
the architecture being compiled to. The type needs to be compatible
with the register being read.
Semantics:
""""""""""
The '``llvm.read_register``' intrinsic returns the current value of the
register, where possible. The '``llvm.write_register``' intrinsic sets
the current value of the register, where possible.
This is useful to implement named register global variables that need
to always be mapped to a specific register, as is common practice on
bare-metal programs including OS kernels.
The compiler doesn't check for register availability or for other uses of the
register in surrounding code, including inline assembly. Because of that,
allocatable registers are not supported.
Warning: So far this only works with the stack pointer on selected
architectures (ARM, AArch64, PowerPC and x86_64). A significant amount of
work is needed to support other registers, and even more so allocatable
registers.
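A hedged sketch of reading the stack pointer on a 64-bit target (the register
name ``"sp"`` is a target-specific assumption):
.. code-block:: llvm
      declare i64 @llvm.read_register.i64(metadata)
      declare void @llvm.write_register.i64(metadata, i64)
      define i64 @read_sp() {
      entry:
        %sp = call i64 @llvm.read_register.i64(metadata !0)   ; current value of the named register
        ret i64 %sp
      }
      !0 = !{!"sp\00"}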
.. _int_stacksave:
'``llvm.stacksave``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.stacksave()
Overview:
"""""""""
The '``llvm.stacksave``' intrinsic is used to remember the current state
of the function stack, for use with
:ref:`llvm.stackrestore <int_stackrestore>`. This is useful for
implementing language features like scoped automatic variable sized
arrays in C99.
Semantics:
""""""""""
This intrinsic returns an opaque pointer value that can be passed to
:ref:`llvm.stackrestore <int_stackrestore>`. When an
``llvm.stackrestore`` intrinsic is executed with a value saved from
``llvm.stacksave``, it effectively restores the state of the stack to
the state it was in when the ``llvm.stacksave`` intrinsic executed. In
practice, this pops any :ref:`alloca <i_alloca>` blocks from the stack that
were allocated after the ``llvm.stacksave`` was executed.
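A hedged sketch of the scoped variable-sized array use case; this also
illustrates the ``llvm.stackrestore`` intrinsic described next.
.. code-block:: llvm
      declare i8* @llvm.stacksave()
      declare void @llvm.stackrestore(i8*)
      define void @scoped_vla(i32 %n) {
      entry:
        %sp = call i8* @llvm.stacksave()         ; remember the current stack state
        %vla = alloca i32, i32 %n                ; variable-sized stack allocation
        ; ... use %vla ...
        call void @llvm.stackrestore(i8* %sp)    ; pop %vla off the stack
        ret void
      }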
.. _int_stackrestore:
'``llvm.stackrestore``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.stackrestore(i8* %ptr)
Overview:
"""""""""
The '``llvm.stackrestore``' intrinsic is used to restore the state of
the function stack to the state it was in when the corresponding
:ref:`llvm.stacksave <int_stacksave>` intrinsic executed. This is
useful for implementing language features like scoped automatic variable
sized arrays in C99.
Semantics:
""""""""""
See the description for :ref:`llvm.stacksave <int_stacksave>`.
.. _int_get_dynamic_area_offset:
'``llvm.get.dynamic.area.offset``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.get.dynamic.area.offset.i32()
declare i64 @llvm.get.dynamic.area.offset.i64()
Overview:
"""""""""
The '``llvm.get.dynamic.area.offset.*``' intrinsic family is used to
get the offset from the native stack pointer to the address of the most
recent dynamic alloca on the caller's stack. These intrinsics are
intended for use in combination with
:ref:`llvm.stacksave <int_stacksave>` to get a
pointer to the most recent dynamic alloca. This is useful, for example,
for AddressSanitizer's stack unpoisoning routines.
Semantics:
""""""""""
These intrinsics return a non-negative integer value that can be used to
get the address of the most recent dynamic alloca, allocated by :ref:`alloca <i_alloca>`
on the caller's stack. In particular, for targets where the stack grows
downwards, adding this offset to the native stack pointer gives the address of
the most recent dynamic alloca. For targets where the stack grows upwards, the
situation is a bit more complicated, because subtracting this value from the
stack pointer gives the address one past the end of the most recent dynamic
alloca.
Although for most targets
:ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>` simply
returns zero, for others, such as PowerPC and PowerPC64, it returns a
compile-time-known constant value.
The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
must match the target's default address space's (address space 0) pointer type.
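A hedged sketch for a 64-bit target with a downward-growing stack; treating the
``llvm.stacksave`` result as the native stack pointer is itself an assumption
about the target's lowering.
.. code-block:: llvm
      declare i8* @llvm.stacksave()
      declare i64 @llvm.get.dynamic.area.offset.i64()
      define i64 @most_recent_dynamic_alloca(i32 %n) {
      entry:
        %vla = alloca i32, i32 %n
        %sp = call i8* @llvm.stacksave()
        %off = call i64 @llvm.get.dynamic.area.offset.i64()
        %sp.int = ptrtoint i8* %sp to i64
        %addr = add i64 %sp.int, %off            ; address of %vla on a downward-growing stack
        ret i64 %addr
      }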
'``llvm.prefetch``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.prefetch(i8* <address>, i32 <rw>, i32 <locality>, i32 <cache type>)
Overview:
"""""""""
The '``llvm.prefetch``' intrinsic is a hint to the code generator to
insert a prefetch instruction if supported; otherwise, it is a noop.
Prefetches have no effect on the behavior of the program but can change
its performance characteristics.
Arguments:
""""""""""
``address`` is the address to be prefetched, ``rw`` is the specifier
determining if the fetch should be for a read (0) or write (1), and
``locality`` is a temporal locality specifier ranging from (0) - no
locality, to (3) - extremely local keep in cache. The ``cache type``
specifies whether the prefetch is performed on the data (1) or
instruction (0) cache. The ``rw``, ``locality`` and ``cache type``
arguments must be constant integers.
Semantics:
""""""""""
This intrinsic does not modify the behavior of the program. In
particular, prefetches cannot trap and do not produce a value. On
targets that support this intrinsic, the prefetch can provide hints to
the processor cache for better performance.
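A minimal sketch:
.. code-block:: llvm
      declare void @llvm.prefetch(i8*, i32, i32, i32)
      define void @warm(i8* %p) {
      entry:
        ; prefetch %p for a read (0), with high temporal locality (3), into the data cache (1)
        call void @llvm.prefetch(i8* %p, i32 0, i32 3, i32 1)
        ret void
      }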
'``llvm.pcmarker``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.pcmarker(i32 <id>)
Overview:
"""""""""
The '``llvm.pcmarker``' intrinsic is a method to export a Program
Counter (PC) in a region of code to simulators and other tools. The
method is target specific, but it is expected that the marker will use
exported symbols to transmit the PC of the marker. The marker makes no
guarantees that it will remain with any specific instruction after
optimizations. It is possible that the presence of a marker will inhibit
optimizations. The intended use is to be inserted after optimizations to
allow correlations of simulation runs.
Arguments:
""""""""""
``id`` is a numerical id identifying the marker.
Semantics:
""""""""""
This intrinsic does not modify the behavior of the program. Backends
that do not support this intrinsic may ignore it.
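A minimal sketch (the id value is arbitrary):
.. code-block:: llvm
      declare void @llvm.pcmarker(i32)
      define void @marked() {
      entry:
        call void @llvm.pcmarker(i32 1)   ; export the PC at this point under id 1
        ret void
      }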
'``llvm.readcyclecounter``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i64 @llvm.readcyclecounter()
Overview:
"""""""""
The '``llvm.readcyclecounter``' intrinsic provides access to the cycle
counter register (or similar low latency, high accuracy clocks) on those
targets that support it. On X86, it should map to RDTSC. On Alpha, it
should map to RPCC. As the backing counters overflow quickly (on the
order of 9 seconds on Alpha), this should only be used for small
timings.
Semantics:
""""""""""
When directly supported, reading the cycle counter should not modify any
memory. Implementations are allowed to either return an application-specific
value or a system-wide value. On backends without support, this is lowered
to a constant 0.
Note that runtime support may be conditional on the privilege level the code
is running at and on the host platform.
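A typical use, keeping the overflow caveat above in mind, is to read the
counter before and after a region of interest and subtract (value names are
illustrative):

.. code-block:: llvm

      %start   = call i64 @llvm.readcyclecounter()
      ; ... code being timed ...
      %end     = call i64 @llvm.readcyclecounter()
      %elapsed = sub i64 %end, %start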
'``llvm.clear_cache``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.clear_cache(i8*, i8*)
Overview:
"""""""""
The '``llvm.clear_cache``' intrinsic ensures visibility of modifications
in the specified range to the execution unit of the processor. On
targets with non-unified instruction and data cache, the implementation
flushes the instruction cache.
Semantics:
""""""""""
On platforms with coherent instruction and data caches (e.g. x86), this
intrinsic is a nop. On platforms with non-coherent instruction and data
cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate
instructions or a system call, if cache flushing requires special
privileges.
The default behavior is to emit a call to ``__clear_cache`` from the
runtime library.
This intrinsic does *not* empty the instruction pipeline. Modifications
of the current function are outside the scope of the intrinsic.
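For instance, a JIT that has just written machine code into a buffer bounded
by ``%begin`` and ``%end`` (illustrative names) would make it visible to
instruction fetch with:

.. code-block:: llvm

      call void @llvm.clear_cache(i8* %begin, i8* %end)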
'``llvm.instrprof_increment``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.instrprof_increment(i8* <name>, i64 <hash>,
i32 <num-counters>, i32 <index>)
Overview:
"""""""""
The '``llvm.instrprof_increment``' intrinsic can be emitted by a
frontend for use with instrumentation-based profiling. These will be
lowered by the ``-instrprof`` pass to generate execution counts of a
program at runtime.
Arguments:
""""""""""
The first argument is a pointer to a global variable containing the
name of the entity being instrumented. This should generally be the
(mangled) function name for a set of counters.
The second argument is a hash value that can be used by the consumer
of the profile data to detect changes to the instrumented source, and
the third is the number of counters associated with ``name``. It is an
error if ``hash`` or ``num-counters`` differ between two instances of
``instrprof_increment`` that refer to the same name.
The last argument refers to which of the counters for ``name`` should
be incremented. It should be a value between 0 and ``num-counters``.
Semantics:
""""""""""
This intrinsic represents an increment of a profiling counter. It will
cause the ``-instrprof`` pass to generate the appropriate data
structures and the code to increment the appropriate value, in a
format that can be written out by a compiler runtime and consumed via
the ``llvm-profdata`` tool.
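As a sketch only (the name variable, hash and counter count below are
hypothetical, not what any particular frontend emits), incrementing counter 0
of the 2 counters associated with a function ``foo`` could look like:

.. code-block:: llvm

      @__profn_foo = private constant [3 x i8] c"foo"   ; hypothetical name variable

      %name = getelementptr inbounds [3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0
      call void @llvm.instrprof_increment(i8* %name, i64 12345, i32 2, i32 0)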
'``llvm.instrprof_increment_step``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.instrprof_increment_step(i8* <name>, i64 <hash>,
i32 <num-counters>,
i32 <index>, i64 <step>)
Overview:
"""""""""
The '``llvm.instrprof_increment_step``' intrinsic is an extension to
the '``llvm.instrprof_increment``' intrinsic with an additional fifth
argument to specify the step of the increment.
Arguments:
""""""""""
The first four arguments are the same as those of the
'``llvm.instrprof_increment``' intrinsic.
The last argument specifies the value of the increment of the counter variable.
Semantics:
""""""""""
See the description of the '``llvm.instrprof_increment``' intrinsic.
'``llvm.instrprof_value_profile``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.instrprof_value_profile(i8* <name>, i64 <hash>,
i64 <value>, i32 <value_kind>,
i32 <index>)
Overview:
"""""""""
The '``llvm.instrprof_value_profile``' intrinsic can be emitted by a
frontend for use with instrumentation-based profiling. This will be
lowered by the ``-instrprof`` pass to find out the target values that
instrumented expressions take in a program at runtime.
Arguments:
""""""""""
The first argument is a pointer to a global variable containing the
name of the entity being instrumented. ``name`` should generally be the
(mangled) function name for a set of counters.
The second argument is a hash value that can be used by the consumer
of the profile data to detect changes to the instrumented source. It
is an error if ``hash`` differs between two instances of
``llvm.instrprof_*`` that refer to the same name.
The third argument is the value of the expression being profiled. The profiled
expression's value should be representable as an unsigned 64-bit value. The
fourth argument represents the kind of value profiling that is being done. The
supported value profiling kinds are enumerated through the
``InstrProfValueKind`` type declared in the
``<include/llvm/ProfileData/InstrProf.h>`` header file. The last argument is the
index of the instrumented expression within ``name``. It should be >= 0.
Semantics:
""""""""""
This intrinsic represents the point where a call to a runtime routine
should be inserted for value profiling of target expressions. The ``-instrprof``
pass will generate the appropriate data structures and replace the
``llvm.instrprof_value_profile`` intrinsic with the call to the profile
runtime library with proper arguments.
'``llvm.thread.pointer``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.thread.pointer()
Overview:
"""""""""
The '``llvm.thread.pointer``' intrinsic returns the value of the thread
pointer.
Semantics:
""""""""""
The '``llvm.thread.pointer``' intrinsic returns a pointer to the TLS area
for the current thread. The exact semantics of this value are target
specific: it may point to the start of TLS area, to the end, or somewhere
in the middle. Depending on the target, this intrinsic may read a register,
call a helper function, read from an alternate memory space, or perform
other operations necessary to locate the TLS area. Not all targets support
this intrinsic.
Standard C Library Intrinsics
-----------------------------
LLVM provides intrinsics for a few important standard C library
functions. These intrinsics allow source-language front-ends to pass
information about the alignment of the pointer arguments to the code
generator, providing opportunity for more efficient code generation.
.. _int_memcpy:
'``llvm.memcpy``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.memcpy`` on any
integer bit width and for different address spaces. Not all targets
support all bit widths however.
::
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* <dest>, i8* <src>,
i32 <len>, i32 <align>, i1 <isvolatile>)
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* <dest>, i8* <src>,
i64 <len>, i32 <align>, i1 <isvolatile>)
Overview:
"""""""""
The '``llvm.memcpy.*``' intrinsics copy a block of memory from the
source location to the destination location.
Note that, unlike the standard libc function, the ``llvm.memcpy.*``
intrinsics do not return a value, take extra alignment/isvolatile
arguments, and allow the pointers to be in specified address spaces.
Arguments:
""""""""""
The first argument is a pointer to the destination, the second is a
pointer to the source. The third argument is an integer argument
specifying the number of bytes to copy, the fourth argument is the
alignment of the source and destination locations, and the fifth is a
boolean indicating a volatile access.
If the call to this intrinsic has an alignment value that is not 0 or 1,
then the caller guarantees that both the source and destination pointers
are aligned to that boundary.
If the ``isvolatile`` parameter is ``true``, the ``llvm.memcpy`` call is
a :ref:`volatile operation <volatile>`. The detailed access behavior is not
very cleanly specified and it is unwise to depend on it.
Semantics:
""""""""""
The '``llvm.memcpy.*``' intrinsics copy a block of memory from the
source location to the destination location, which are not allowed to
overlap. It copies "len" bytes of memory over. If the argument is known
to be aligned to some boundary, this can be specified as the fourth
argument, otherwise it should be set to 0 or 1 (both meaning no alignment).
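For example, a non-volatile copy of 16 bytes between two buffers (illustrative
names) that are both known to be 4-byte aligned could be written as:

.. code-block:: llvm

      call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 16, i32 4, i1 false)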
.. _int_memmove:
'``llvm.memmove``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use llvm.memmove on any integer
bit width and for different address space. Not all targets support all
bit widths however.
::
declare void @llvm.memmove.p0i8.p0i8.i32(i8* <dest>, i8* <src>,
i32 <len>, i32 <align>, i1 <isvolatile>)
declare void @llvm.memmove.p0i8.p0i8.i64(i8* <dest>, i8* <src>,
i64 <len>, i32 <align>, i1 <isvolatile>)
Overview:
"""""""""
The '``llvm.memmove.*``' intrinsics move a block of memory from the
source location to the destination location. It is similar to the
'``llvm.memcpy``' intrinsic but allows the two memory locations to
overlap.
Note that, unlike the standard libc function, the ``llvm.memmove.*``
intrinsics do not return a value, take extra alignment/isvolatile
arguments, and allow the pointers to be in specified address spaces.
Arguments:
""""""""""
The first argument is a pointer to the destination, the second is a
pointer to the source. The third argument is an integer argument
specifying the number of bytes to copy, the fourth argument is the
alignment of the source and destination locations, and the fifth is a
boolean indicating a volatile access.
If the call to this intrinsic has an alignment value that is not 0 or 1,
then the caller guarantees that the source and destination pointers are
aligned to that boundary.
If the ``isvolatile`` parameter is ``true``, the ``llvm.memmove`` call
is a :ref:`volatile operation <volatile>`. The detailed access behavior is
not very cleanly specified and it is unwise to depend on it.
Semantics:
""""""""""
The '``llvm.memmove.*``' intrinsics copy a block of memory from the
source location to the destination location, which may overlap. It
copies "len" bytes of memory over. If the argument is known to be
aligned to some boundary, this can be specified as the fourth argument,
otherwise it should be set to 0 or 1 (both meaning no alignment).
.. _int_memset:
'``llvm.memset.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use llvm.memset on any integer
bit width and for different address spaces. However, not all targets
support all bit widths.
::
declare void @llvm.memset.p0i8.i32(i8* <dest>, i8 <val>,
i32 <len>, i32 <align>, i1 <isvolatile>)
declare void @llvm.memset.p0i8.i64(i8* <dest>, i8 <val>,
i64 <len>, i32 <align>, i1 <isvolatile>)
Overview:
"""""""""
The '``llvm.memset.*``' intrinsics fill a block of memory with a
particular byte value.
Note that, unlike the standard libc function, the ``llvm.memset``
intrinsic does not return a value and takes extra alignment/volatile
arguments. Also, the destination can be in an arbitrary address space.
Arguments:
""""""""""
The first argument is a pointer to the destination to fill, the second
is the byte value with which to fill it, the third argument is an
integer argument specifying the number of bytes to fill, and the fourth
argument is the known alignment of the destination location.
If the call to this intrinsic has an alignment value that is not 0 or 1,
then the caller guarantees that the destination pointer is aligned to
that boundary.
If the ``isvolatile`` parameter is ``true``, the ``llvm.memset`` call is
a :ref:`volatile operation <volatile>`. The detailed access behavior is not
very cleanly specified and it is unwise to depend on it.
Semantics:
""""""""""
The '``llvm.memset.*``' intrinsics fill "len" bytes of memory starting
at the destination location. If the argument is known to be aligned to
some boundary, this can be specified as the fourth argument, otherwise
it should be set to 0 or 1 (both meaning no alignment).
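For example, zeroing a 32-byte buffer (illustrative name) whose alignment is
unknown, as a non-volatile access, could be written as:

.. code-block:: llvm

      call void @llvm.memset.p0i8.i32(i8* %dst, i8 0, i32 32, i32 0, i1 false)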
'``llvm.sqrt.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.sqrt`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare x86_fp80 @llvm.sqrt.f80(x86_fp80 %Val)
declare fp128 @llvm.sqrt.f128(fp128 %Val)
declare ppc_fp128 @llvm.sqrt.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.sqrt``' intrinsics return the square root of the specified value,
returning the same value as the libm '``sqrt``' functions would, but without
trapping or setting ``errno``.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the square root of the operand if it is a nonnegative
floating point number.
'``llvm.powi.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.powi`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.powi.f32(float %Val, i32 %power)
declare double @llvm.powi.f64(double %Val, i32 %power)
declare x86_fp80 @llvm.powi.f80(x86_fp80 %Val, i32 %power)
declare fp128 @llvm.powi.f128(fp128 %Val, i32 %power)
declare ppc_fp128 @llvm.powi.ppcf128(ppc_fp128 %Val, i32 %power)
Overview:
"""""""""
The '``llvm.powi.*``' intrinsics return the first operand raised to the
specified (positive or negative) power. The order of evaluation of
multiplications is not defined. When a vector of floating point type is
used, the second argument remains a scalar integer value.
Arguments:
""""""""""
The second argument is an integer power, and the first is a value to
raise to that power.
Semantics:
""""""""""
This function returns the first value raised to the second power with an
unspecified sequence of rounding operations.
'``llvm.sin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.sin`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.sin.f32(float %Val)
declare double @llvm.sin.f64(double %Val)
declare x86_fp80 @llvm.sin.f80(x86_fp80 %Val)
declare fp128 @llvm.sin.f128(fp128 %Val)
declare ppc_fp128 @llvm.sin.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.sin.*``' intrinsics return the sine of the operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the sine of the specified operand, returning the
same values as the libm ``sin`` functions would, and handles error
conditions in the same way.
'``llvm.cos.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.cos`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.cos.f32(float %Val)
declare double @llvm.cos.f64(double %Val)
declare x86_fp80 @llvm.cos.f80(x86_fp80 %Val)
declare fp128 @llvm.cos.f128(fp128 %Val)
declare ppc_fp128 @llvm.cos.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.cos.*``' intrinsics return the cosine of the operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the cosine of the specified operand, returning the
same values as the libm ``cos`` functions would, and handles error
conditions in the same way.
'``llvm.pow.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.pow`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.pow.f32(float %Val, float %Power)
declare double @llvm.pow.f64(double %Val, double %Power)
declare x86_fp80 @llvm.pow.f80(x86_fp80 %Val, x86_fp80 %Power)
declare fp128 @llvm.pow.f128(fp128 %Val, fp128 %Power)
declare ppc_fp128 @llvm.pow.ppcf128(ppc_fp128 %Val, ppc_fp128 %Power)
Overview:
"""""""""
The '``llvm.pow.*``' intrinsics return the first operand raised to the
specified (positive or negative) power.
Arguments:
""""""""""
The second argument is a floating point power, and the first is a value
to raise to that power.
Semantics:
""""""""""
This function returns the first value raised to the second power,
returning the same values as the libm ``pow`` functions would, and
handles error conditions in the same way.
'``llvm.exp.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.exp`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.exp.f32(float %Val)
declare double @llvm.exp.f64(double %Val)
declare x86_fp80 @llvm.exp.f80(x86_fp80 %Val)
declare fp128 @llvm.exp.f128(fp128 %Val)
declare ppc_fp128 @llvm.exp.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.exp.*``' intrinsics compute the base-e exponential of the specified
value.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the same values as the libm ``exp`` functions
would, and handles error conditions in the same way.
'``llvm.exp2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.exp2`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.exp2.f32(float %Val)
declare double @llvm.exp2.f64(double %Val)
declare x86_fp80 @llvm.exp2.f80(x86_fp80 %Val)
declare fp128 @llvm.exp2.f128(fp128 %Val)
declare ppc_fp128 @llvm.exp2.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.exp2.*``' intrinsics compute the base-2 exponential of the
specified value.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the same values as the libm ``exp2`` functions
would, and handles error conditions in the same way.
'``llvm.log.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.log.f32(float %Val)
declare double @llvm.log.f64(double %Val)
declare x86_fp80 @llvm.log.f80(x86_fp80 %Val)
declare fp128 @llvm.log.f128(fp128 %Val)
declare ppc_fp128 @llvm.log.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.log.*``' intrinsics compute the base-e logarithm of the specified
value.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the same values as the libm ``log`` functions
would, and handles error conditions in the same way.
'``llvm.log10.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log10`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.log10.f32(float %Val)
declare double @llvm.log10.f64(double %Val)
declare x86_fp80 @llvm.log10.f80(x86_fp80 %Val)
declare fp128 @llvm.log10.f128(fp128 %Val)
declare ppc_fp128 @llvm.log10.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.log10.*``' intrinsics compute the base-10 logarithm of the
specified value.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the same values as the libm ``log10`` functions
would, and handles error conditions in the same way.
'``llvm.log2.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.log2`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.log2.f32(float %Val)
declare double @llvm.log2.f64(double %Val)
declare x86_fp80 @llvm.log2.f80(x86_fp80 %Val)
declare fp128 @llvm.log2.f128(fp128 %Val)
declare ppc_fp128 @llvm.log2.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.log2.*``' intrinsics compute the base-2 logarithm of the specified
value.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same type.
Semantics:
""""""""""
This function returns the same values as the libm ``log2`` functions
would, and handles error conditions in the same way.
'``llvm.fma.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.fma`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.fma.f32(float %a, float %b, float %c)
declare double @llvm.fma.f64(double %a, double %b, double %c)
declare x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c)
declare fp128 @llvm.fma.f128(fp128 %a, fp128 %b, fp128 %c)
declare ppc_fp128 @llvm.fma.ppcf128(ppc_fp128 %a, ppc_fp128 %b, ppc_fp128 %c)
Overview:
"""""""""
The '``llvm.fma.*``' intrinsics perform the fused multiply-add
operation.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``fma`` functions
would, and does not set errno.
'``llvm.fabs.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.fabs`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.fabs.f32(float %Val)
declare double @llvm.fabs.f64(double %Val)
declare x86_fp80 @llvm.fabs.f80(x86_fp80 %Val)
declare fp128 @llvm.fabs.f128(fp128 %Val)
declare ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.fabs.*``' intrinsics return the absolute value of the
operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``fabs`` functions
would, and handles error conditions in the same way.
'``llvm.minnum.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.minnum`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.minnum.f32(float %Val0, float %Val1)
declare double @llvm.minnum.f64(double %Val0, double %Val1)
declare x86_fp80 @llvm.minnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
declare fp128 @llvm.minnum.f128(fp128 %Val0, fp128 %Val1)
declare ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
Overview:
"""""""""
The '``llvm.minnum.*``' intrinsics return the minimum of the two
arguments.
Arguments:
""""""""""
The arguments and return value are floating point numbers of the same
type.
Semantics:
""""""""""
Follows the IEEE-754 semantics for minNum, which also matches libm's
fmin.
If either operand is a NaN, returns the other non-NaN operand. Returns
NaN only if both operands are NaN. If the operands compare equal,
returns a value that compares equal to both operands. This means that
fmin(+/-0.0, +/-0.0) could return either -0.0 or 0.0.
'``llvm.maxnum.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.maxnum`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.maxnum.f32(float %Val0, float %Val1)
declare double @llvm.maxnum.f64(double %Val0, double %Val1)
declare x86_fp80 @llvm.maxnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
declare fp128 @llvm.maxnum.f128(fp128 %Val0, fp128 %Val1)
declare ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
Overview:
"""""""""
The '``llvm.maxnum.*``' intrinsics return the maximum of the two
arguments.
Arguments:
""""""""""
The arguments and return value are floating point numbers of the same
type.
Semantics:
""""""""""
Follows the IEEE-754 semantics for maxNum, which also matches libm's
fmax.
If either operand is a NaN, returns the other non-NaN operand. Returns
NaN only if both operands are NaN. If the operands compare equal,
returns a value that compares equal to both operands. This means that
fmax(+/-0.0, +/-0.0) could return either -0.0 or 0.0.
'``llvm.copysign.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.copysign`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.copysign.f32(float %Mag, float %Sgn)
declare double @llvm.copysign.f64(double %Mag, double %Sgn)
declare x86_fp80 @llvm.copysign.f80(x86_fp80 %Mag, x86_fp80 %Sgn)
declare fp128 @llvm.copysign.f128(fp128 %Mag, fp128 %Sgn)
declare ppc_fp128 @llvm.copysign.ppcf128(ppc_fp128 %Mag, ppc_fp128 %Sgn)
Overview:
"""""""""
The '``llvm.copysign.*``' intrinsics return a value with the magnitude of the
first operand and the sign of the second operand.
Arguments:
""""""""""
The arguments and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``copysign``
functions would, and handles error conditions in the same way.
'``llvm.floor.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.floor`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.floor.f32(float %Val)
declare double @llvm.floor.f64(double %Val)
declare x86_fp80 @llvm.floor.f80(x86_fp80 %Val)
declare fp128 @llvm.floor.f128(fp128 %Val)
declare ppc_fp128 @llvm.floor.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.floor.*``' intrinsics return the floor of the operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``floor`` functions
would, and handles error conditions in the same way.
'``llvm.ceil.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.ceil`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.ceil.f32(float %Val)
declare double @llvm.ceil.f64(double %Val)
declare x86_fp80 @llvm.ceil.f80(x86_fp80 %Val)
declare fp128 @llvm.ceil.f128(fp128 %Val)
declare ppc_fp128 @llvm.ceil.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.ceil.*``' intrinsics return the ceiling of the operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``ceil`` functions
would, and handles error conditions in the same way.
'``llvm.trunc.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.trunc`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.trunc.f32(float %Val)
declare double @llvm.trunc.f64(double %Val)
declare x86_fp80 @llvm.trunc.f80(x86_fp80 %Val)
declare fp128 @llvm.trunc.f128(fp128 %Val)
declare ppc_fp128 @llvm.trunc.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.trunc.*``' intrinsics return the operand rounded to the
nearest integer not larger in magnitude than the operand.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``trunc`` functions
would, and handles error conditions in the same way.
'``llvm.rint.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.rint`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.rint.f32(float %Val)
declare double @llvm.rint.f64(double %Val)
declare x86_fp80 @llvm.rint.f80(x86_fp80 %Val)
declare fp128 @llvm.rint.f128(fp128 %Val)
declare ppc_fp128 @llvm.rint.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.rint.*``' intrinsics return the operand rounded to the
nearest integer. It may raise an inexact floating-point exception if the
operand isn't an integer.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``rint`` functions
would, and handles error conditions in the same way.
'``llvm.nearbyint.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.nearbyint`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.nearbyint.f32(float %Val)
declare double @llvm.nearbyint.f64(double %Val)
declare x86_fp80 @llvm.nearbyint.f80(x86_fp80 %Val)
declare fp128 @llvm.nearbyint.f128(fp128 %Val)
declare ppc_fp128 @llvm.nearbyint.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.nearbyint.*``' intrinsics return the operand rounded to the
nearest integer.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``nearbyint``
functions would, and handles error conditions in the same way.
'``llvm.round.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.round`` on any
floating point or vector of floating point type. Not all targets support
all types however.
::
declare float @llvm.round.f32(float %Val)
declare double @llvm.round.f64(double %Val)
declare x86_fp80 @llvm.round.f80(x86_fp80 %Val)
declare fp128 @llvm.round.f128(fp128 %Val)
declare ppc_fp128 @llvm.round.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
The '``llvm.round.*``' intrinsics return the operand rounded to the
nearest integer.
Arguments:
""""""""""
The argument and return value are floating point numbers of the same
type.
Semantics:
""""""""""
This function returns the same values as the libm ``round``
functions would, and handles error conditions in the same way.
Bit Manipulation Intrinsics
---------------------------
LLVM provides intrinsics for a few important bit manipulation
operations. These allow efficient code generation for some algorithms.
'``llvm.bitreverse.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic function. You can use bitreverse on any
integer type.
::
declare i16 @llvm.bitreverse.i16(i16 <id>)
declare i32 @llvm.bitreverse.i32(i32 <id>)
declare i64 @llvm.bitreverse.i64(i64 <id>)
Overview:
"""""""""
The '``llvm.bitreverse``' family of intrinsics is used to reverse the
bit pattern of an integer value; for example ``0b10110110`` becomes
``0b01101101``.
Semantics:
""""""""""
The ``llvm.bitreverse.iN`` intrinsic returns an iN value that has bit
``M`` in the input moved to bit ``N-M-1`` in the output, where bits are
numbered from 0.
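For example, reversing the 32-bit value ``1`` moves bit 0 to bit 31:

.. code-block:: llvm

      %rev = call i32 @llvm.bitreverse.i32(i32 1)   ; yields i32 -2147483648 (0x80000000)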
'``llvm.bswap.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic function. You can use bswap on any
integer type that is an even number of bytes (i.e. BitWidth % 16 == 0).
::
declare i16 @llvm.bswap.i16(i16 <id>)
declare i32 @llvm.bswap.i32(i32 <id>)
declare i64 @llvm.bswap.i64(i64 <id>)
Overview:
"""""""""
The '``llvm.bswap``' family of intrinsics is used to byte swap integer
values with an even number of bytes (positive multiple of 16 bits).
These are useful for performing operations on data that is not in the
target's native byte order.
Semantics:
""""""""""
The ``llvm.bswap.i16`` intrinsic returns an i16 value that has the high
and low byte of the input i16 swapped. Similarly, the ``llvm.bswap.i32``
intrinsic returns an i32 value that has the four bytes of the input i32
swapped, so that if the input bytes are numbered 0, 1, 2, 3 then the
returned i32 will have its bytes in 3, 2, 1, 0 order. The
``llvm.bswap.i48``, ``llvm.bswap.i64`` and other intrinsics extend this
concept to additional even-byte lengths (6 bytes, 8 bytes and more,
respectively).
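For example, byte-swapping the 32-bit value ``0x12345678`` produces
``0x78563412``:

.. code-block:: llvm

      %swapped = call i32 @llvm.bswap.i32(i32 305419896)   ; 0x12345678 -> 0x78563412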
'``llvm.ctpop.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use llvm.ctpop on any integer
bit width, or on any vector with integer elements. Not all targets
support all bit widths or vector types, however.
::
declare i8 @llvm.ctpop.i8(i8 <src>)
declare i16 @llvm.ctpop.i16(i16 <src>)
declare i32 @llvm.ctpop.i32(i32 <src>)
declare i64 @llvm.ctpop.i64(i64 <src>)
declare i256 @llvm.ctpop.i256(i256 <src>)
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32> <src>)
Overview:
"""""""""
The '``llvm.ctpop``' family of intrinsics counts the number of bits set
in a value.
Arguments:
""""""""""
The only argument is the value to be counted. The argument may be of any
integer type, or a vector with integer elements. The return type must
match the argument type.
Semantics:
""""""""""
The '``llvm.ctpop``' intrinsic counts the 1's in a variable, or within
each element of a vector.
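For example, counting the bits set in ``0xFF``:

.. code-block:: llvm

      %bits = call i32 @llvm.ctpop.i32(i32 255)   ; yields i32 8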
'``llvm.ctlz.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.ctlz`` on any
integer bit width, or any vector whose elements are integers. Not all
targets support all bit widths or vector types, however.
::
declare i8 @llvm.ctlz.i8 (i8 <src>, i1 <is_zero_undef>)
declare i16 @llvm.ctlz.i16 (i16 <src>, i1 <is_zero_undef>)
declare i32 @llvm.ctlz.i32 (i32 <src>, i1 <is_zero_undef>)
declare i64 @llvm.ctlz.i64 (i64 <src>, i1 <is_zero_undef>)
declare i256 @llvm.ctlz.i256(i256 <src>, i1 <is_zero_undef>)
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32> <src>, i1 <is_zero_undef>)
Overview:
"""""""""
The '``llvm.ctlz``' family of intrinsic functions counts the number of
leading zeros in a variable.
Arguments:
""""""""""
The first argument is the value to be counted. This argument may be of
any integer type, or a vector with integer element type. The return
type must match the first argument type.
The second argument must be a constant and is a flag to indicate whether
the intrinsic should ensure that a zero as the first argument produces a
defined result. Historically some architectures did not provide a
defined result for zero values as efficiently, and many algorithms are
now predicated on avoiding zero-value inputs.
Semantics:
""""""""""
The '``llvm.ctlz``' intrinsic counts the leading (most significant)
zeros in a variable, or within each element of the vector. If
``src == 0`` then the result is the size in bits of the type of ``src``
if ``is_zero_undef == 0`` and ``undef`` otherwise. For example,
``llvm.ctlz(i32 2) = 30``.
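Written out as IR, the example above with the ``is_zero_undef`` flag set (the
input here is known to be non-zero) is:

.. code-block:: llvm

      %lz = call i32 @llvm.ctlz.i32(i32 2, i1 true)   ; yields i32 30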
'``llvm.cttz.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.cttz`` on any
integer bit width, or any vector of integer elements. Not all targets
support all bit widths or vector types, however.
::
declare i8 @llvm.cttz.i8 (i8 <src>, i1 <is_zero_undef>)
declare i16 @llvm.cttz.i16 (i16 <src>, i1 <is_zero_undef>)
declare i32 @llvm.cttz.i32 (i32 <src>, i1 <is_zero_undef>)
declare i64 @llvm.cttz.i64 (i64 <src>, i1 <is_zero_undef>)
declare i256 @llvm.cttz.i256(i256 <src>, i1 <is_zero_undef>)
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32> <src>, i1 <is_zero_undef>)
Overview:
"""""""""
The '``llvm.cttz``' family of intrinsic functions counts the number of
trailing zeros.
Arguments:
""""""""""
The first argument is the value to be counted. This argument may be of
any integer type, or a vector with integer element type. The return
type must match the first argument type.
The second argument must be a constant and is a flag to indicate whether
the intrinsic should ensure that a zero as the first argument produces a
defined result. Historically some architectures did not provide a
defined result for zero values as efficiently, and many algorithms are
now predicated on avoiding zero-value inputs.
Semantics:
""""""""""
The '``llvm.cttz``' intrinsic counts the trailing (least significant)
zeros in a variable, or within each element of a vector. If ``src == 0``
then the result is the size in bits of the type of ``src`` if
``is_zero_undef == 0`` and ``undef`` otherwise. For example,
``llvm.cttz(2) = 1``.
.. _int_overflow:
Arithmetic with Overflow Intrinsics
-----------------------------------
LLVM provides intrinsics for fast arithmetic overflow checking.
Each of these intrinsics returns a two-element struct. The first
element of this struct contains the result of the corresponding
arithmetic operation modulo 2\ :sup:`n`\ , where n is the bit width of
the result. Therefore, for example, the first element of the struct
returned by ``llvm.sadd.with.overflow.i32`` is always the same as the
result of a 32-bit ``add`` instruction with the same operands, where
the ``add`` is *not* modified by an ``nsw`` or ``nuw`` flag.
The second element of the result is an ``i1`` that is 1 if the
arithmetic operation overflowed and 0 otherwise. An operation
overflows if, for any values of its operands ``A`` and ``B`` and for
any ``N`` larger than the operands' width, ``ext(A op B) to iN`` is
not equal to ``(ext(A) to iN) op (ext(B) to iN)`` where ``ext`` is
``sext`` for signed overflow and ``zext`` for unsigned overflow, and
``op`` is the underlying arithmetic operation.
The behavior of these intrinsics is well-defined for all argument
values.
'``llvm.sadd.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.sadd.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.sadd.with.overflow``' family of intrinsic functions perform
a signed addition of the two arguments, and indicate whether an overflow
occurred during the signed summation.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo signed
addition.
Semantics:
""""""""""
The '``llvm.sadd.with.overflow``' family of intrinsic functions perform
a signed addition of the two variables. They return a structure --- the
first element of which is the signed summation, and the second element
of which is a bit specifying if the signed summation resulted in an
overflow.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %overflow, label %normal
'``llvm.uadd.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.uadd.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.uadd.with.overflow``' family of intrinsic functions perform
an unsigned addition of the two arguments, and indicate whether a carry
occurred during the unsigned summation.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo unsigned
addition.
Semantics:
""""""""""
The '``llvm.uadd.with.overflow``' family of intrinsic functions perform
an unsigned addition of the two arguments. They return a structure --- the
first element of which is the sum, and the second element of which is a
bit specifying if the unsigned summation resulted in a carry.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %carry, label %normal
'``llvm.ssub.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.ssub.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.ssub.with.overflow``' family of intrinsic functions perform
a signed subtraction of the two arguments, and indicate whether an
overflow occurred during the signed subtraction.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo signed
subtraction.
Semantics:
""""""""""
The '``llvm.ssub.with.overflow``' family of intrinsic functions perform
a signed subtraction of the two arguments. They return a structure --- the
first element of which is the subtraction, and the second element of
which is a bit specifying if the signed subtraction resulted in an
overflow.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %overflow, label %normal
'``llvm.usub.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.usub.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.usub.with.overflow``' family of intrinsic functions perform
an unsigned subtraction of the two arguments, and indicate whether an
overflow occurred during the unsigned subtraction.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo unsigned
subtraction.
Semantics:
""""""""""
The '``llvm.usub.with.overflow``' family of intrinsic functions perform
an unsigned subtraction of the two arguments. They return a structure ---
the first element of which is the subtraction, and the second element of
which is a bit specifying if the unsigned subtraction resulted in an
overflow.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %overflow, label %normal
'``llvm.smul.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.smul.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.smul.with.overflow``' family of intrinsic functions perform
a signed multiplication of the two arguments, and indicate whether an
overflow occurred during the signed multiplication.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo signed
multiplication.
Semantics:
""""""""""
The '``llvm.smul.with.overflow``' family of intrinsic functions perform
a signed multiplication of the two arguments. They return a structure ---
the first element of which is the multiplication, and the second element
of which is a bit specifying if the signed multiplication resulted in an
overflow.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %overflow, label %normal
'``llvm.umul.with.overflow.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.umul.with.overflow``
on any integer bit width.
::
declare {i16, i1} @llvm.umul.with.overflow.i16(i16 %a, i16 %b)
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
declare {i64, i1} @llvm.umul.with.overflow.i64(i64 %a, i64 %b)
Overview:
"""""""""
The '``llvm.umul.with.overflow``' family of intrinsic functions perform
an unsigned multiplication of the two arguments, and indicate whether an
overflow occurred during the unsigned multiplication.
Arguments:
""""""""""
The arguments (%a and %b) and the first element of the result structure
may be of integer types of any bit width, but they must have the same
bit width. The second element of the result structure must be of type
``i1``. ``%a`` and ``%b`` are the two values that will undergo unsigned
multiplication.
Semantics:
""""""""""
The '``llvm.umul.with.overflow``' family of intrinsic functions perform
an unsigned multiplication of the two arguments. They return a structure ---
the first element of which is the multiplication, and the second
element of which is a bit specifying if the unsigned multiplication
resulted in an overflow.
Examples:
"""""""""
.. code-block:: llvm
%res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
%sum = extractvalue {i32, i1} %res, 0
%obit = extractvalue {i32, i1} %res, 1
br i1 %obit, label %overflow, label %normal
Specialised Arithmetic Intrinsics
---------------------------------
'``llvm.canonicalize.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.canonicalize.f32(float %a)
declare double @llvm.canonicalize.f64(double %b)
Overview:
"""""""""
The '``llvm.canonicalize.*``' intrinsic returns the platform specific canonical
encoding of a floating point number. This canonicalization is useful for
implementing certain numeric primitives such as frexp. The canonical encoding is
defined by IEEE-754-2008 to be:
::
2.1.8 canonical encoding: The preferred encoding of a floating-point
representation in a format. Applied to declets, significands of finite
numbers, infinities, and NaNs, especially in decimal formats.
This operation can also be considered equivalent to the IEEE-754-2008
conversion of a floating-point value to the same format. NaNs are handled
according to section 6.2.
Examples of non-canonical encodings:
- x87 pseudo denormals, pseudo NaNs, pseudo Infinity, Unnormals. These are
converted to a canonical representation per hardware-specific protocol.
- Many normal decimal floating point numbers have non-canonical alternative
encodings.
- Some machines, like GPUs or ARMv7 NEON, do not support subnormal values.
These are treated as non-canonical encodings of zero and will be flushed to
a zero of the same sign by this operation.
Note that per IEEE-754-2008 6.2, systems that support signaling NaNs with
default exception handling must signal an invalid exception, and produce a
quiet NaN result.
This function should always be implementable as multiplication by 1.0, provided
that the compiler does not constant fold the operation. Likewise, division by
1.0 and ``llvm.minnum(x, x)`` are possible implementations. Addition with
-0.0 is also sufficient provided that the rounding mode is not -Infinity.
``@llvm.canonicalize`` must preserve the equality relation. That is:
- ``(@llvm.canonicalize(x) == x)`` is equivalent to ``(x == x)``
- ``(@llvm.canonicalize(x) == @llvm.canonicalize(y))`` is equivalent to
  ``(x == y)``
Additionally, the sign of zero must be conserved:
``@llvm.canonicalize(-0.0) = -0.0`` and ``@llvm.canonicalize(+0.0) = +0.0``
The payload bits of a NaN must be conserved, with two exceptions.
First, environments which use only a single canonical representation of NaN
must perform said canonicalization. Second, SNaNs must be quieted per the
usual methods.
The canonicalization operation may be optimized away if:
- The input is known to be canonical. For example, it was produced by a
floating-point operation that is required by the standard to be canonical.
- The result is consumed only by (or fused with) other floating-point
operations. That is, the bits of the floating point value are not examined.
'``llvm.fmuladd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
declare double @llvm.fmuladd.f64(double %a, double %b, double %c)
Overview:
"""""""""
The '``llvm.fmuladd.*``' intrinsic functions represent multiply-add
expressions that can be fused if the code generator determines that (a) the
target instruction set has support for a fused operation, and (b) that the
fused operation is more efficient than the equivalent, separate pair of mul
and add instructions.
Arguments:
""""""""""
The '``llvm.fmuladd.*``' intrinsics each take three arguments: two
multiplicands, a and b, and an addend c.
Semantics:
""""""""""
The expression:
::
%0 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
is equivalent to the expression a \* b + c, except that rounding will
not be performed between the multiplication and addition steps if the
code generator fuses the operations. Fusion is not guaranteed, even if
the target platform supports it. If a fused multiply-add is required the
corresponding llvm.fma.\* intrinsic function should be used
instead. This never sets errno, just as '``llvm.fma.*``'.
Examples:
"""""""""
.. code-block:: llvm
%r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
Experimental Vector Reduction Intrinsics
----------------------------------------
Horizontal reductions of vectors can be expressed using the following
intrinsics. Each one takes a vector operand as an input and applies its
respective operation across all elements of the vector, returning a single
scalar result of the same element type.
'``llvm.experimental.vector.reduce.add.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
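As a sketch, summing the four lanes of a ``<4 x i32>`` vector ``%v``
(illustrative name):

.. code-block:: llvm

      %sum = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v)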
'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
``ADD`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has fast-math flags, then the reduction will not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used.
The second argument must be a vector of floating point values.
Examples:
"""""""""
.. code-block:: llvm
%fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
``MUL`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has fast-math flags, then the reduction will not preserve
the associativity of an equivalent scalarized counterpart. If it does not have
fast-math flags, then the reduction will be *ordered*, implying that the
operation respects the associativity of a scalarized reduction.
Arguments:
""""""""""
The first argument to this intrinsic is a scalar accumulator value, which is
only used when there are no fast-math flags attached. This argument may be undef
when fast-math flags are used.
The second argument must be a vector of floating point values.
Examples:
"""""""""
.. code-block:: llvm
%fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
%ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
'``llvm.experimental.vector.reduce.and.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.or.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
of a vector, returning the result as a scalar. The return type matches the
element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
reduction of a vector, returning the result as a scalar. The return type matches
the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
``MAX`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
``MIN`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.umax.*``' intrinsics do an unsigned
integer ``MAX`` reduction of a vector, returning the result as a scalar. The
return type matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.umin.*``' intrinsics do an unsigned
integer ``MIN`` reduction of a vector, returning the result as a scalar. The
return type matches the element-type of the vector input.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of integer values.
'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics do a floating point
``MAX`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has the ``nnan`` fast-math flag then the operation can
assume that NaNs are not present in the input vector.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of floating point values.
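For example, a minimal sketch (``%vec`` is assumed to be defined earlier; the
``nnan`` variant additionally asserts that the input contains no NaNs):

.. code-block:: llvm

      %max  = call float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %vec)
      %maxn = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %vec)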
'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics do a floating point
``MIN`` reduction of a vector, returning the result as a scalar. The return type
matches the element-type of the vector input.
If the intrinsic call has the ``nnan`` fast-math flag then the operation can
assume that NaNs are not present in the input vector.
Arguments:
""""""""""
The argument to this intrinsic must be a vector of floating point values.
Half Precision Floating Point Intrinsics
----------------------------------------
For most target platforms, half precision floating point is a
storage-only format. This means that it is a dense encoding (in memory)
but does not support computation in the format.
As a result, code must first load the half-precision floating point
value as an i16, then convert it to float with
:ref:`llvm.convert.from.fp16 <int_convert_from_fp16>`. Computation can
then be performed on the float value (including extending to double
etc). To store the value back to memory, it is first converted to float
if needed, then converted to i16 with
:ref:`llvm.convert.to.fp16 <int_convert_to_fp16>`, and then stored as an
i16 value.
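A minimal sketch of this round trip, using the conversion intrinsics described
below (the global ``@x`` is assumed to hold the ``i16`` storage):

.. code-block:: llvm

      %h   = load i16, i16* @x, align 2                       ; load the storage-only value
      %f   = call float @llvm.convert.from.fp16.f32(i16 %h)   ; widen to float
      %sum = fadd float %f, 1.0                               ; compute in float
      %out = call i16 @llvm.convert.to.fp16.f32(float %sum)   ; narrow back to half
      store i16 %out, i16* @x, align 2                        ; store the i16 encoding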
.. _int_convert_to_fp16:
'``llvm.convert.to.fp16``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i16 @llvm.convert.to.fp16.f32(float %a)
declare i16 @llvm.convert.to.fp16.f64(double %a)
Overview:
"""""""""
The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
conventional floating point type to half precision floating point format.
Arguments:
""""""""""
The intrinsic function takes a single argument - the value to be
converted.
Semantics:
""""""""""
The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
conventional floating point format to half precision floating point format. The
return value is an ``i16`` which contains the converted number.
Examples:
"""""""""
.. code-block:: llvm
%res = call i16 @llvm.convert.to.fp16.f32(float %a)
store i16 %res, i16* @x, align 2
.. _int_convert_from_fp16:
'``llvm.convert.from.fp16``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare float @llvm.convert.from.fp16.f32(i16 %a)
declare double @llvm.convert.from.fp16.f64(i16 %a)
Overview:
"""""""""
The '``llvm.convert.from.fp16``' intrinsic function performs a
conversion from half precision floating point format to single precision
floating point format.
Arguments:
""""""""""
The intrinsic function takes a single argument - the value to be
converted.
Semantics:
""""""""""
The '``llvm.convert.from.fp16``' intrinsic function performs a
conversion from half precision floating point format to single
precision floating point format. The input half-float value is
represented by an ``i16`` value.
Examples:
"""""""""
.. code-block:: llvm
%a = load i16, i16* @x, align 2
%res = call float @llvm.convert.from.fp16.f32(i16 %a)
.. _dbg_intrinsics:
Debugger Intrinsics
-------------------
The LLVM debugger intrinsics (which all start with ``llvm.dbg.``
prefix), are described in the `LLVM Source Level
Debugging <SourceLevelDebugging.html#format_common_intrinsics>`_
document.
Exception Handling Intrinsics
-----------------------------
The LLVM exception handling intrinsics (which all start with
``llvm.eh.`` prefix), are described in the `LLVM Exception
Handling <ExceptionHandling.html#format_common_intrinsics>`_ document.
.. _int_trampoline:
Trampoline Intrinsics
---------------------
These intrinsics make it possible to excise one parameter, marked with
the :ref:`nest <nest>` attribute, from a function. The result is a
callable function pointer lacking the nest parameter - the caller does
not need to provide a value for it. Instead, the value to use is stored
in advance in a "trampoline", a block of memory usually allocated on the
stack, which also contains code to splice the nest value into the
argument list. This is used to implement the GCC nested function address
extension.
For example, if the function is ``i32 f(i8* nest %c, i32 %x, i32 %y)``
then the resulting function pointer has signature ``i32 (i32, i32)*``.
It can be created as follows:
.. code-block:: llvm
%tramp = alloca [10 x i8], align 4 ; size and alignment only correct for X86
%tramp1 = getelementptr [10 x i8], [10 x i8]* %tramp, i32 0, i32 0
call void @llvm.init.trampoline(i8* %tramp1, i8* bitcast (i32 (i8*, i32, i32)* @f to i8*), i8* %nval)
%p = call i8* @llvm.adjust.trampoline(i8* %tramp1)
%fp = bitcast i8* %p to i32 (i32, i32)*
The call ``%val = call i32 %fp(i32 %x, i32 %y)`` is then equivalent to
``%val = call i32 @f(i8* %nval, i32 %x, i32 %y)``.
.. _int_it:
'``llvm.init.trampoline``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.init.trampoline(i8* <tramp>, i8* <func>, i8* <nval>)
Overview:
"""""""""
This fills the memory pointed to by ``tramp`` with executable code,
turning it into a trampoline.
Arguments:
""""""""""
The ``llvm.init.trampoline`` intrinsic takes three arguments, all
pointers. The ``tramp`` argument must point to a sufficiently large and
sufficiently aligned block of memory; this memory is written to by the
intrinsic. Note that the size and the alignment are target-specific -
LLVM currently provides no portable way of determining them, so a
front-end that generates this intrinsic needs to have some
target-specific knowledge. The ``func`` argument must hold a function
bitcast to an ``i8*``.
Semantics:
""""""""""
The block of memory pointed to by ``tramp`` is filled with target
dependent code, turning it into a function. Then ``tramp`` needs to be
passed to :ref:`llvm.adjust.trampoline <int_at>` to get a pointer which can
be :ref:`bitcast (to a new function) and called <int_trampoline>`. The new
function's signature is the same as that of ``func`` with any arguments
marked with the ``nest`` attribute removed. At most one such ``nest``
argument is allowed, and it must be of pointer type. Calling the new
function is equivalent to calling ``func`` with the same argument list,
but with ``nval`` used for the missing ``nest`` argument. If, after
calling ``llvm.init.trampoline``, the memory pointed to by ``tramp`` is
modified, then the effect of any later call to the returned function
pointer is undefined.
.. _int_at:
'``llvm.adjust.trampoline``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.adjust.trampoline(i8* <tramp>)
Overview:
"""""""""
This performs any required machine-specific adjustment to the address of
a trampoline (passed as ``tramp``).
Arguments:
""""""""""
``tramp`` must point to a block of memory which already has trampoline
code filled in by a previous call to
:ref:`llvm.init.trampoline <int_it>`.
Semantics:
""""""""""
On some architectures the address of the code to be executed needs to be
different than the address where the trampoline is actually stored. This
intrinsic returns the executable address corresponding to ``tramp``
after performing the required machine specific adjustments. The pointer
returned can then be :ref:`bitcast and executed <int_trampoline>`.
.. _int_mload_mstore:
Masked Vector Load and Store Intrinsics
---------------------------------------
LLVM provides intrinsics for predicated vector load and store operations. The predicate is specified by a mask operand, which holds one bit per vector element, switching the associated vector lane on or off. The memory addresses corresponding to the "off" lanes are not accessed. When all bits of the mask are on, the intrinsic is identical to a regular vector load or store. When all bits are off, no memory is accessed.
.. _int_mload:
'``llvm.masked.load.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The loaded data is a vector of any integer, floating point or pointer data type.
::
declare <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64 (<2 x double>* <ptr>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
;; The data is a vector of pointers to double
declare <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64 (<8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x double*> <passthru>)
;; The data is a vector of function pointers
declare <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f (<8 x i32 ()*>* <ptr>, i32 <alignment>, <8 x i1> <mask>, <8 x i32 ()*> <passthru>)
Overview:
"""""""""
Reads a vector from memory according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes. The masked-off lanes in the result vector are taken from the corresponding lanes of the '``passthru``' operand.
Arguments:
""""""""""
The first operand is the base pointer for the load. The second operand is the alignment of the source location. It must be a constant integer value. The third operand, mask, is a vector of boolean values with the same number of elements as the return type. The fourth is a pass-through value that is used to fill the masked-off lanes of the result. The return type, underlying type of the base pointer and the type of the '``passthru``' operand are the same vector types.
Semantics:
""""""""""
The '``llvm.masked.load``' intrinsic is designed for conditional reading of selected vector elements in a single IR operation. It is useful for targets that support vector masked loads and allows vectorizing predicated basic blocks on these targets. Other targets may support this intrinsic differently, for example by lowering it into a sequence of branches that guard scalar load operations.
The result of this operation is equivalent to a regular vector load instruction followed by a 'select' between the loaded and the passthru values, predicated on the same mask. However, using this intrinsic prevents exceptions on memory access to masked-off lanes.
::
%res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32 (<16 x float>* %ptr, i32 4, <16 x i1> %mask, <16 x float> %passthru)
;; The result of the two following instructions is identical aside from potential memory access exception
%loadval = load <16 x float>, <16 x float>* %ptr, align 4
%res = select <16 x i1> %mask, <16 x float> %loadval, <16 x float> %passthru
.. _int_mstore:
'``llvm.masked.store.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The data stored in memory is a vector of any integer, floating point or pointer data type.
::
declare void @llvm.masked.store.v8i32.p0v8i32 (<8 x i32> <value>, <8 x i32>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
declare void @llvm.masked.store.v16f32.p0v16f32 (<16 x float> <value>, <16 x float>* <ptr>, i32 <alignment>, <16 x i1> <mask>)
;; The data is a vector of pointers to double
declare void @llvm.masked.store.v8p0f64.p0v8p0f64 (<8 x double*> <value>, <8 x double*>* <ptr>, i32 <alignment>, <8 x i1> <mask>)
;; The data is a vector of function pointers
declare void @llvm.masked.store.v4p0f_i32f.p0v4p0f_i32f (<4 x i32 ()*> <value>, <4 x i32 ()*>* <ptr>, i32 <alignment>, <4 x i1> <mask>)
Overview:
"""""""""
Writes a vector to memory according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes.
Arguments:
""""""""""
The first operand is the vector value to be written to memory. The second operand is the base pointer for the store; it has the same underlying type as the value operand. The third operand is the alignment of the destination location. The fourth operand, mask, is a vector of boolean values. The types of the mask and the value operand must have the same number of vector elements.
Semantics:
""""""""""
The '``llvm.masked.store``' intrinsic is designed for conditional writing of selected vector elements in a single IR operation. It is useful for targets that support vector masked stores and allows vectorizing predicated basic blocks on these targets. Other targets may support this intrinsic differently, for example by lowering it into a sequence of branches that guard scalar store operations.
The result of this operation is equivalent to a load-modify-store sequence. However, using this intrinsic prevents exceptions and data races on memory access to masked-off lanes.
::
call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %value, <16 x float>* %ptr, i32 4, <16 x i1> %mask)
;; The result of the following instructions is identical aside from potential data races and memory access exceptions
%oldval = load <16 x float>, <16 x float>* %ptr, align 4
%res = select <16 x i1> %mask, <16 x float> %value, <16 x float> %oldval
store <16 x float> %res, <16 x float>* %ptr, align 4
Masked Vector Gather and Scatter Intrinsics
-------------------------------------------
LLVM provides intrinsics for vector gather and scatter operations. They are similar to :ref:`Masked Vector Load and Store <int_mload_mstore>`, except they are designed for arbitrary memory accesses, rather than sequential memory accesses. Gather and scatter also employ a mask operand, which holds one bit per vector element, switching the associated vector lane on or off. The memory addresses corresponding to the "off" lanes are not accessed. When all bits are off, no memory is accessed.
.. _int_mgather:
'``llvm.masked.gather.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The loaded data are multiple scalar values of any integer, floating point or pointer data type gathered together into one vector.
::
declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32 (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
declare <2 x double> @llvm.masked.gather.v2f64.v2p1f64 (<2 x double addrspace(1)*> <ptrs>, i32 <alignment>, <2 x i1> <mask>, <2 x double> <passthru>)
declare <8 x float*> @llvm.masked.gather.v8p0f32.v8p0p0f32 (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1> <mask>, <8 x float*> <passthru>)
Overview:
"""""""""
Reads scalar values from arbitrary memory locations and gathers them into one vector. The memory locations are provided in the vector of pointers '``ptrs``'. The memory is accessed according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes. The masked-off lanes in the result vector are taken from the corresponding lanes of the '``passthru``' operand.
Arguments:
""""""""""
The first operand is a vector of pointers which holds all memory addresses to read. The second operand is an alignment of the source addresses. It must be a constant integer value. The third operand, mask, is a vector of boolean values with the same number of elements as the return type. The fourth is a pass-through value that is used to fill the masked-off lanes of the result. The return type, underlying type of the vector of pointers and the type of the '``passthru``' operand are the same vector types.
Semantics:
""""""""""
The '``llvm.masked.gather``' intrinsic is designed for conditional reading of multiple scalar values from arbitrary memory locations in a single IR operation. It is useful for targets that support vector masked gathers and allows vectorizing basic blocks with data and control divergence. Other targets may support this intrinsic differently, for example by lowering it into a sequence of scalar load operations.
The semantics of this operation are equivalent to a sequence of conditional scalar loads followed by gathering all of the loaded values into a single vector. The mask restricts memory access to certain lanes and facilitates vectorization of predicated basic blocks.
::
%res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
;; The gather with all-true mask is equivalent to the following instruction sequence
%ptr0 = extractelement <4 x double*> %ptrs, i32 0
%ptr1 = extractelement <4 x double*> %ptrs, i32 1
%ptr2 = extractelement <4 x double*> %ptrs, i32 2
%ptr3 = extractelement <4 x double*> %ptrs, i32 3
%val0 = load double, double* %ptr0, align 8
%val1 = load double, double* %ptr1, align 8
%val2 = load double, double* %ptr2, align 8
%val3 = load double, double* %ptr3, align 8
%vec0 = insertelement <4 x double> undef, double %val0, i32 0
%vec01 = insertelement <4 x double> %vec0, double %val1, i32 1
%vec012 = insertelement <4 x double> %vec01, double %val2, i32 2
%vec0123 = insertelement <4 x double> %vec012, double %val3, i32 3
.. _int_mscatter:
'``llvm.masked.scatter.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The data stored in memory is a vector of any integer, floating point or pointer data type. Each vector element is stored in an arbitrary memory address. Scatter with overlapping addresses is guaranteed to be ordered from least-significant to most-significant element.
::
declare void @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> <value>, <8 x i32*> <ptrs>, i32 <alignment>, <8 x i1> <mask>)
declare void @llvm.masked.scatter.v16f32.v16p1f32 (<16 x float> <value>, <16 x float addrspace(1)*> <ptrs>, i32 <alignment>, <16 x i1> <mask>)
declare void @llvm.masked.scatter.v4p0f64.v4p0p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1> <mask>)
Overview:
"""""""""
Writes each element from the value vector to the corresponding memory address. The memory addresses are represented as a vector of pointers. Writing is done according to the provided mask. The mask holds a bit for each vector lane, and is used to prevent memory accesses to the masked-off lanes.
Arguments:
""""""""""
The first operand is a vector value to be written to memory. The second operand is a vector of pointers, pointing to where the value elements should be stored. It has the same underlying type as the value operand. The third operand is an alignment of the destination addresses. The fourth operand, mask, is a vector of boolean values. The types of the mask and the value operand must have the same number of vector elements.
Semantics:
""""""""""
The '``llvm.masked.scatter``' intrinsic is designed for writing selected vector elements to arbitrary memory addresses in a single IR operation. The operation may be conditional, when not all bits in the mask are switched on. It is useful for targets that support vector masked scatter and allows vectorizing basic blocks with data and control divergence. Other targets may support this intrinsic differently, for example by lowering it into a sequence of branches that guard scalar store operations.
::
;; This instruction unconditionally stores data vector in multiple addresses
call void @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4, <8 x i1> <i1 true, i1 true, .. i1 true>)
;; It is equivalent to a list of scalar stores
%val0 = extractelement <8 x i32> %value, i32 0
%val1 = extractelement <8 x i32> %value, i32 1
..
%val7 = extractelement <8 x i32> %value, i32 7
%ptr0 = extractelement <8 x i32*> %ptrs, i32 0
%ptr1 = extractelement <8 x i32*> %ptrs, i32 1
..
%ptr7 = extractelement <8 x i32*> %ptrs, i32 7
;; Note: the order of the following stores is important when they overlap:
store i32 %val0, i32* %ptr0, align 4
store i32 %val1, i32* %ptr1, align 4
..
store i32 %val7, i32* %ptr7, align 4
Memory Use Markers
------------------
This class of intrinsics provides information about the lifetime of
memory objects and ranges where variables are immutable.
.. _int_lifestart:
'``llvm.lifetime.start``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.lifetime.start(i64 <size>, i8* nocapture <ptr>)
Overview:
"""""""""
The '``llvm.lifetime.start``' intrinsic specifies the start of a memory
object's lifetime.
Arguments:
""""""""""
The first argument is a constant integer representing the size of the
object, or -1 if it is variable sized. The second argument is a pointer
to the object.
Semantics:
""""""""""
This intrinsic indicates that before this point in the code, the value
of the memory pointed to by ``ptr`` is dead. This means that it is known
to never be used and has an undefined value. A load from the pointer
that precedes this intrinsic can be replaced with ``'undef'``.
.. _int_lifeend:
'``llvm.lifetime.end``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.lifetime.end(i64 <size>, i8* nocapture <ptr>)
Overview:
"""""""""
The '``llvm.lifetime.end``' intrinsic specifies the end of a memory
object's lifetime.
Arguments:
""""""""""
The first argument is a constant integer representing the size of the
object, or -1 if it is variable sized. The second argument is a pointer
to the object.
Semantics:
""""""""""
This intrinsic indicates that after this point in the code, the value of
the memory pointed to by ``ptr`` is dead. This means that it is known to
never be used and has an undefined value. Any stores into the memory
object following this intrinsic may be removed as dead.
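For example, a frontend might bracket the useful lifetime of a stack object as
follows (a minimal sketch; the 32-byte size matches the ``alloca``):

.. code-block:: llvm

      %buf = alloca [32 x i8], align 8
      %p   = getelementptr [32 x i8], [32 x i8]* %buf, i32 0, i32 0
      call void @llvm.lifetime.start(i64 32, i8* %p)
      ;; ... %buf may be used here ...
      call void @llvm.lifetime.end(i64 32, i8* %p)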
'``llvm.invariant.start``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The memory object can belong to any address space.
::
declare {}* @llvm.invariant.start.p0i8(i64 <size>, i8* nocapture <ptr>)
Overview:
"""""""""
The '``llvm.invariant.start``' intrinsic specifies that the contents of
a memory object will not change.
Arguments:
""""""""""
The first argument is a constant integer representing the size of the
object, or -1 if it is variable sized. The second argument is a pointer
to the object.
Semantics:
""""""""""
This intrinsic indicates that until an ``llvm.invariant.end`` that uses
the return value, the referenced memory location is constant and
unchanging.
'``llvm.invariant.end``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. The memory object can belong to any address space.
::
declare void @llvm.invariant.end.p0i8({}* <start>, i64 <size>, i8* nocapture <ptr>)
Overview:
"""""""""
The '``llvm.invariant.end``' intrinsic specifies that the contents of a
memory object are mutable.
Arguments:
""""""""""
The first argument is the matching ``llvm.invariant.start`` intrinsic.
The second argument is a constant integer representing the size of the
object, or -1 if it is variable sized, and the third argument is a
pointer to the object.
Semantics:
""""""""""
This intrinsic indicates that the memory is mutable again.
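For example, a minimal sketch (``%p`` is assumed to point to a 4-byte object):

.. code-block:: llvm

      %inv = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %p)
      ;; the 4 bytes at %p may be assumed not to change here
      call void @llvm.invariant.end.p0i8({}* %inv, i64 4, i8* %p)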
'``llvm.invariant.group.barrier``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.invariant.group.barrier(i8* <ptr>)
Overview:
"""""""""
The '``llvm.invariant.group.barrier``' intrinsic can be used when an invariant
established by invariant.group metadata no longer holds, to obtain a new pointer
value that does not carry the invariant information.
Arguments:
""""""""""
The ``llvm.invariant.group.barrier`` intrinsic takes only one argument, which is
the pointer to the memory for which the ``invariant.group`` no longer holds.
Semantics:
""""""""""
Returns another pointer that aliases its argument but which is considered different
for the purposes of ``load``/``store`` ``invariant.group`` metadata.
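For example, a minimal sketch (``!0`` is assumed to be an empty metadata node,
``!0 = !{}``, declared at module level):

.. code-block:: llvm

      %v    = load i8, i8* %ptr, !invariant.group !0
      %ptr2 = call i8* @llvm.invariant.group.barrier(i8* %ptr)
      store i8 7, i8* %ptr2    ; not related to the invariant.group loads through %ptr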
Constrained Floating Point Intrinsics
-------------------------------------
These intrinsics are used to provide special handling of floating point
operations when specific rounding mode or floating point exception behavior is
required. By default, LLVM optimization passes assume that the rounding mode is
round-to-nearest and that floating point exceptions will not be monitored.
Constrained FP intrinsics are used to support non-default rounding modes and
accurately preserve exception behavior without compromising LLVM's ability to
optimize FP code when the default behavior is used.
Each of these intrinsics corresponds to a normal floating point operation. The
first two arguments and the return value are the same as the corresponding FP
operation.
The third argument is a metadata argument specifying the rounding mode to be
assumed. This argument must be one of the following strings:
::
"round.dynamic"
"round.tonearest"
"round.downward"
"round.upward"
"round.towardzero"
If this argument is "round.dynamic" optimization passes must assume that the
rounding mode is unknown and may change at runtime. No transformations that
depend on rounding mode may be performed in this case.
The other possible values for the rounding mode argument correspond to the
similarly named IEEE rounding modes. If the argument is any of these values
optimization passes may perform transformations as long as they are consistent
with the specified rounding mode.
For example, 'x-0'->'x' is not a valid transformation if the rounding mode is
"round.downward" or "round.dynamic" because if the value of 'x' is +0 then
'x-0' should evaluate to '-0' when rounding downward. However, this
transformation is legal for all other rounding modes.
For values other than "round.dynamic" optimization passes may assume that the
actual runtime rounding mode (as defined in a target-specific manner) matches
the specified rounding mode, but this is not guaranteed. Using a specific
non-dynamic rounding mode which does not match the actual rounding mode at
runtime results in undefined behavior.
The fourth argument to the constrained floating point intrinsics specifies the
required exception behavior. This argument must be one of the following
strings:
::
"fpexcept.ignore"
"fpexcept.maytrap"
"fpexcept.strict"
If this argument is "fpexcept.ignore" optimization passes may assume that the
exception status flags will not be read and that floating point exceptions will
be masked. This allows transformations to be performed that may change the
exception semantics of the original code. For example, FP operations may be
speculatively executed in this case whereas they must not be for either of the
other possible values of this argument.
If the exception behavior argument is "fpexcept.maytrap" optimization passes
must avoid transformations that may raise exceptions that would not have been
raised by the original code (such as speculatively executing FP operations), but
passes are not required to preserve all exceptions that are implied by the
original code. For example, exceptions may be potentially hidden by constant
folding.
If the exception behavior argument is "fpexcept.strict" all transformations must
strictly preserve the floating point exception semantics of the original code.
Any FP exception that would have been raised by the original code must be raised
by the transformed code, and the transformed code must not raise any FP
exceptions that would not have been raised by the original code. This is the
exception behavior argument that will be used if the code being compiled reads
the FP exception status flags, but this mode can also be used with code that
unmasks FP exceptions.
The number and order of floating point exceptions are NOT guaranteed. For
example, a series of FP operations that each may raise exceptions may be
vectorized into a single instruction that raises each unique exception a single
time.
'``llvm.experimental.constrained.fadd``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.fadd(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.fadd``' intrinsic returns the sum of its
two operands.
Arguments:
""""""""""
The first two arguments to the '``llvm.experimental.constrained.fadd``'
intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector <t_vector>`
of floating point values. Both arguments must have identical types.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
The value produced is the floating point sum of the two value operands and has
the same type as the operands.
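For example, a call of the ``float`` overload (a minimal sketch; the ``.f32``
suffix follows the usual overload mangling, and the metadata strings select
dynamic rounding and strict exception behavior):

.. code-block:: llvm

      %sum = call float @llvm.experimental.constrained.fadd.f32(
                            float %a, float %b,
                            metadata !"round.dynamic",
                            metadata !"fpexcept.strict")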
'``llvm.experimental.constrained.fsub``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.fsub(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.fsub``' intrinsic returns the difference
of its two operands.
Arguments:
""""""""""
The first two arguments to the '``llvm.experimental.constrained.fsub``'
intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector <t_vector>`
of floating point values. Both arguments must have identical types.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
The value produced is the floating point difference of the two value operands
and has the same type as the operands.
'``llvm.experimental.constrained.fmul``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.fmul(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.fmul``' intrinsic returns the product of
its two operands.
Arguments:
""""""""""
The first two arguments to the '``llvm.experimental.constrained.fmul``'
intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector <t_vector>`
of floating point values. Both arguments must have identical types.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
The value produced is the floating point product of the two value operands and
has the same type as the operands.
'``llvm.experimental.constrained.fdiv``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.fdiv(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.fdiv``' intrinsic returns the quotient of
its two operands.
Arguments:
""""""""""
The first two arguments to the '``llvm.experimental.constrained.fdiv``'
intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector <t_vector>`
of floating point values. Both arguments must have identical types.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
The value produced is the floating point quotient of the two value operands and
has the same type as the operands.
'``llvm.experimental.constrained.frem``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.frem(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.frem``' intrinsic returns the remainder
from the division of its two operands.
Arguments:
""""""""""
The first two arguments to the '``llvm.experimental.constrained.frem``'
intrinsic must be :ref:`floating point <t_floating>` or :ref:`vector <t_vector>`
of floating point values. Both arguments must have identical types.
The third and fourth arguments specify the rounding mode and exception
behavior as described above. The rounding mode argument has no effect, since
the result of frem is never rounded, but the argument is included for
consistency with the other constrained floating point intrinsics.
Semantics:
""""""""""
The value produced is the floating point remainder from the division of the two
value operands and has the same type as the operands. The remainder has the
same sign as the dividend.
Constrained libm-equivalent Intrinsics
--------------------------------------
In addition to the basic floating point operations for which constrained
intrinsics are described above, there are constrained versions of various
operations which provide equivalent behavior to a corresponding libm function.
These intrinsics allow the precise behavior of these operations with respect to
rounding mode and exception behavior to be controlled.
As with the basic constrained floating point intrinsics, the rounding mode
and exception behavior arguments only control the behavior of the optimizer.
They do not change the runtime floating point environment.
'``llvm.experimental.constrained.sqrt``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.sqrt(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.sqrt``' intrinsic returns the square root
of the specified value, returning the same value as the libm '``sqrt``'
functions would, but without setting ``errno``.
Arguments:
""""""""""
The first argument and the return type are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the nonnegative square root of the specified value.
If the value is less than negative zero, a floating point exception occurs
and the return value is architecture specific.
'``llvm.experimental.constrained.pow``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.pow(<type> <op1>, <type> <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.pow``' intrinsic returns the first operand
raised to the (positive or negative) power specified by the second operand.
Arguments:
""""""""""
The first two arguments and the return value are floating point numbers of the
same type. The second argument specifies the power to which the first argument
should be raised.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the first value raised to the second power,
returning the same values as the libm ``pow`` functions would, and
handles error conditions in the same way.
'``llvm.experimental.constrained.powi``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.powi(<type> <op1>, i32 <op2>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
raised to the (positive or negative) power specified by the second operand. The
order of evaluation of multiplications is not defined. When a vector of floating
point type is used, the second argument remains a scalar integer value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type. The second argument is a 32-bit signed integer specifying the power to
which the first argument should be raised.
The third and fourth arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the first value raised to the second power with an
unspecified sequence of rounding operations.
'``llvm.experimental.constrained.sin``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.sin(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.sin``' intrinsic returns the sine of the
first operand.
Arguments:
""""""""""
The first argument and the return type are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the sine of the specified operand, returning the
same values as the libm ``sin`` functions would, and handles error
conditions in the same way.
'``llvm.experimental.constrained.cos``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.cos(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.cos``' intrinsic returns the cosine of the
first operand.
Arguments:
""""""""""
The first argument and the return type are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the cosine of the specified operand, returning the
same values as the libm ``cos`` functions would, and handles error
conditions in the same way.
'``llvm.experimental.constrained.exp``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.exp(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.exp``' intrinsic computes the base-e
exponential of the specified value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``exp`` functions
would, and handles error conditions in the same way.
'``llvm.experimental.constrained.exp2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.exp2(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.exp2``' intrinsic computes the base-2
exponential of the specified value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``exp2`` functions
would, and handles error conditions in the same way.
'``llvm.experimental.constrained.log``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.log(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.log``' intrinsic computes the base-e
logarithm of the specified value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``log`` functions
would, and handles error conditions in the same way.
'``llvm.experimental.constrained.log10``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.log10(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.log10``' intrinsic computes the base-10
logarithm of the specified value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``log10`` functions
would, and handles error conditions in the same way.
'``llvm.experimental.constrained.log2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.log2(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.log2``' intrinsic computes the base-2
logarithm of the specified value.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``log2`` functions
would, and handles error conditions in the same way.
'``llvm.experimental.constrained.rint``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.rint(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.rint``' intrinsic returns the first
operand rounded to the nearest integer. It may raise an inexact floating point
exception if the operand is not an integer.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``rint`` functions
would, and handles error conditions in the same way. The rounding mode is
described, not determined, by the rounding mode argument. The actual rounding
mode is determined by the runtime floating point environment. The rounding
mode argument is only intended as information to the compiler.
'``llvm.experimental.constrained.nearbyint``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare <type>
@llvm.experimental.constrained.nearbyint(<type> <op1>,
metadata <rounding mode>,
metadata <exception behavior>)
Overview:
"""""""""
The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
operand rounded to the nearest integer. It will not raise an inexact floating
point exception if the operand is not an integer.
Arguments:
""""""""""
The first argument and the return value are floating point numbers of the same
type.
The second and third arguments specify the rounding mode and exception
behavior as described above.
Semantics:
""""""""""
This function returns the same values as the libm ``nearbyint`` functions
would, and handles error conditions in the same way. The rounding mode is
described, not determined, by the rounding mode argument. The actual rounding
mode is determined by the runtime floating point environment. The rounding
mode argument is only intended as information to the compiler.
General Intrinsics
------------------
This class of intrinsics is designed to be generic and has no specific
purpose.
'``llvm.var.annotation``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.var.annotation(i8* <val>, i8* <str>, i8* <str>, i32 <int>)
Overview:
"""""""""
The '``llvm.var.annotation``' intrinsic attaches an annotation string to a local variable.
Arguments:
""""""""""
The first argument is a pointer to a value, the second is a pointer to a
global string, the third is a pointer to a global string which is the
source file name, and the last argument is the line number.
Semantics:
""""""""""
This intrinsic allows annotation of local variables with arbitrary
strings. This can be useful for special purpose optimizations that want
to look for these annotations. These have no other defined use; they are
ignored by code generation and optimization.
'``llvm.ptr.annotation.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use '``llvm.ptr.annotation``' on a
pointer to an integer of any width. *NOTE* you must specify an address space for
the pointer. The identifier for the default address space is the integer
'``0``'.
::
declare i8* @llvm.ptr.annotation.p<address space>i8(i8* <val>, i8* <str>, i8* <str>, i32 <int>)
declare i16* @llvm.ptr.annotation.p<address space>i16(i16* <val>, i8* <str>, i8* <str>, i32 <int>)
declare i32* @llvm.ptr.annotation.p<address space>i32(i32* <val>, i8* <str>, i8* <str>, i32 <int>)
declare i64* @llvm.ptr.annotation.p<address space>i64(i64* <val>, i8* <str>, i8* <str>, i32 <int>)
declare i256* @llvm.ptr.annotation.p<address space>i256(i256* <val>, i8* <str>, i8* <str>, i32 <int>)
Overview:
"""""""""
The '``llvm.ptr.annotation``' intrinsic annotates a pointer to an integer value and returns that pointer.
Arguments:
""""""""""
The first argument is a pointer to an integer value of arbitrary bitwidth
(result of some expression), the second is a pointer to a global string, the
third is a pointer to a global string which is the source file name, and the
last argument is the line number. It returns the value of the first argument.
Semantics:
""""""""""
This intrinsic allows annotation of a pointer to an integer with arbitrary
strings. This can be useful for special purpose optimizations that want to look
for these annotations. These have no other defined use; they are ignored by code
generation and optimization.
'``llvm.annotation.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use '``llvm.annotation``' on
any integer bit width.
::
declare i8 @llvm.annotation.i8(i8 <val>, i8* <str>, i8* <str>, i32 <int>)
declare i16 @llvm.annotation.i16(i16 <val>, i8* <str>, i8* <str>, i32 <int>)
declare i32 @llvm.annotation.i32(i32 <val>, i8* <str>, i8* <str>, i32 <int>)
declare i64 @llvm.annotation.i64(i64 <val>, i8* <str>, i8* <str>, i32 <int>)
declare i256 @llvm.annotation.i256(i256 <val>, i8* <str>, i8* <str>, i32 <int>)
Overview:
"""""""""
The '``llvm.annotation``' intrinsic annotates an arbitrary integer expression and returns its value.
Arguments:
""""""""""
The first argument is an integer value (result of some expression), the
second is a pointer to a global string, the third is a pointer to a
global string which is the source file name, and the last argument is
the line number. It returns the value of the first argument.
Semantics:
""""""""""
This intrinsic allows annotations to be put on arbitrary expressions
with arbitrary strings. This can be useful for special purpose
optimizations that want to look for these annotations. These have no
other defined use; they are ignored by code generation and optimization.
'``llvm.trap``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.trap() noreturn nounwind
Overview:
"""""""""
The '``llvm.trap``' intrinsic causes the program to trap.
Arguments:
""""""""""
None.
Semantics:
""""""""""
This intrinsic is lowered to the target dependent trap instruction. If
the target does not have a trap instruction, this intrinsic will be
lowered to a call of the ``abort()`` function.
'``llvm.debugtrap``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.debugtrap() nounwind
Overview:
"""""""""
The '``llvm.debugtrap``' intrinsic causes a trap intended to get the attention of a debugger.
Arguments:
""""""""""
None.
Semantics:
""""""""""
This intrinsic is lowered to code which is intended to cause an
execution trap with the intention of requesting the attention of a
debugger.
'``llvm.stackprotector``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.stackprotector(i8* <guard>, i8** <slot>)
Overview:
"""""""""
The ``llvm.stackprotector`` intrinsic takes the ``guard`` and stores it
onto the stack at ``slot``. The stack slot is adjusted to ensure that it
is placed on the stack before local variables.
Arguments:
""""""""""
The ``llvm.stackprotector`` intrinsic requires two pointer arguments.
The first argument is the value loaded from the stack guard
``@__stack_chk_guard``. The second argument is an ``alloca`` that has
enough space to hold the value of the guard.
Semantics:
""""""""""
This intrinsic causes the prologue/epilogue inserter to force the position of
the ``AllocaInst`` stack slot to be before local variables on the stack. This is
to ensure that if a local variable on the stack is overwritten, it will destroy
the value of the guard. When the function exits, the guard on the stack is
checked against the original guard by ``llvm.stackprotectorcheck``. If they are
different, then ``llvm.stackprotectorcheck`` causes the program to abort by
calling the ``__stack_chk_fail()`` function.
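A minimal sketch of how this might appear in a function prologue (names are
illustrative; ``@__stack_chk_guard`` is assumed to be declared elsewhere):

.. code-block:: llvm

      %guard.slot = alloca i8*                         ; slot that receives a copy of the guard
      %guard      = load i8*, i8** @__stack_chk_guard
      call void @llvm.stackprotector(i8* %guard, i8** %guard.slot)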
'``llvm.stackguard``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.stackguard()
Overview:
"""""""""
The ``llvm.stackguard`` intrinsic returns the system stack guard value.
It should not be generated by frontends, since it is only for internal use.
This intrinsic exists because the IR-level form of the stack protector is still
supported in FastISel.
Arguments:
""""""""""
None.
Semantics:
""""""""""
On some platforms, the value returned by this intrinsic remains unchanged
between loads in the same thread. On other platforms, it returns the value of
the corresponding global variable, if any (e.g. ``@__stack_chk_guard``).
Currently some platforms have customized IR-level stack guard loading (e.g.
X86 Linux) that is not yet handled by ``llvm.stackguard()``; it should be in
the future.
'``llvm.objectsize``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i32 @llvm.objectsize.i32(i8* <object>, i1 <min>, i1 <nullunknown>)
declare i64 @llvm.objectsize.i64(i8* <object>, i1 <min>, i1 <nullunknown>)
Overview:
"""""""""
The ``llvm.objectsize`` intrinsic is designed to provide information to
the optimizers to determine at compile time whether a) an operation
(like memcpy) will overflow a buffer that corresponds to an object, or
b) that a runtime check for overflow isn't necessary. An object in this
context means an allocation of a specific class, structure, array, or
other object.
Arguments:
""""""""""
The ``llvm.objectsize`` intrinsic takes three arguments. The first argument is
a pointer to or into the ``object``. The second argument determines whether
``llvm.objectsize`` returns 0 (if true) or -1 (if false) when the object size
is unknown. The third argument controls how ``llvm.objectsize`` acts when
``null`` is used as its pointer argument. If it's true and the pointer is in
address space 0, ``null`` is treated as an opaque value with an unknown number
of bytes. Otherwise, ``llvm.objectsize`` reports 0 bytes available when given
``null``.
The second and third arguments only accept constants.
Semantics:
""""""""""
The ``llvm.objectsize`` intrinsic is lowered to a constant representing
the size of the object concerned. If the size cannot be determined at
compile time, ``llvm.objectsize`` returns ``i32/i64 -1 or 0`` (depending
on the ``min`` argument).
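For example, a minimal sketch (with a 16-byte ``alloca`` the call is expected
to fold to ``i64 16``):

.. code-block:: llvm

      %buf  = alloca [16 x i8]
      %p    = getelementptr [16 x i8], [16 x i8]* %buf, i32 0, i32 0
      %size = call i64 @llvm.objectsize.i64(i8* %p, i1 false, i1 true)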
'``llvm.expect``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.expect`` on any
integer bit width.
::
declare i1 @llvm.expect.i1(i1 <val>, i1 <expected_val>)
declare i32 @llvm.expect.i32(i32 <val>, i32 <expected_val>)
declare i64 @llvm.expect.i64(i64 <val>, i64 <expected_val>)
Overview:
"""""""""
The ``llvm.expect`` intrinsic provides information about the expected (most
probable) value of ``val``, which can be used by optimizers.
Arguments:
""""""""""
The ``llvm.expect`` intrinsic takes two arguments. The first argument is
a value. The second argument is the expected value; it must be a constant
value (variables are not allowed).
Semantics:
""""""""""
This intrinsic is lowered to ``val``.
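For example, a branch condition can be annotated as usually false (a minimal
sketch; the labels are illustrative):

.. code-block:: llvm

      %cond     = icmp eq i32 %x, 0
      %expected = call i1 @llvm.expect.i1(i1 %cond, i1 false)   ; hint: %x is usually non-zero
      br i1 %expected, label %rare, label %common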
.. _int_assume:
'``llvm.assume``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.assume(i1 %cond)
Overview:
"""""""""
The ``llvm.assume`` intrinsic allows the optimizer to assume that the provided
condition is true. This information can then be used in simplifying other parts
of the code.
Arguments:
""""""""""
The condition which the optimizer may assume is always true.
Semantics:
""""""""""
The intrinsic allows the optimizer to assume that the provided condition is
always true whenever the control flow reaches the intrinsic call. No code is
generated for this intrinsic, and instructions that contribute only to the
provided condition are not used for code generation. If the condition is
violated during execution, the behavior is undefined.
Note that the optimizer might limit the transformations performed on values
used by the ``llvm.assume`` intrinsic in order to preserve the instructions
only used to form the intrinsic's input argument. This might prove undesirable
if the extra information provided by the ``llvm.assume`` intrinsic does not cause
sufficient overall improvement in code quality. For this reason,
``llvm.assume`` should not be used to document basic mathematical invariants
that the optimizer can otherwise deduce or facts that are of little use to the
optimizer.
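For example, a minimal sketch (the comparison exists only to feed the
assumption):

.. code-block:: llvm

      %cmp = icmp sgt i32 %n, 0
      call void @llvm.assume(i1 %cmp)    ; the optimizer may treat %n as positive from here on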
.. _int_ssa_copy:
'``llvm.ssa_copy``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare type @llvm.ssa_copy(type %operand) returned(1) readnone
Arguments:
""""""""""
The first argument is an operand which is used as the returned value.
Overview:
""""""""""
The ``llvm.ssa_copy`` intrinsic can be used to attach information to
operations by copying them and giving them new names. For example,
the PredicateInfo utility uses it to build Extended SSA form, and
attach various forms of information to operands that dominate specific
uses. It is not meant for general use, only for building temporary
renaming forms that require value splits at certain points.
.. _type.test:
'``llvm.type.test``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i1 @llvm.type.test(i8* %ptr, metadata %type) nounwind readnone
Arguments:
""""""""""
The first argument is a pointer to be tested. The second argument is a
metadata object representing a :doc:`type identifier <TypeMetadata>`.
Overview:
"""""""""
The ``llvm.type.test`` intrinsic tests whether the given pointer is associated
with the given type identifier.
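For example, a minimal sketch (the type identifier string ``"_ZTS1A"`` and the
labels are illustrative):

.. code-block:: llvm

      %ok = call i1 @llvm.type.test(i8* %vtable, metadata !"_ZTS1A")
      br i1 %ok, label %cont, label %trap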
'``llvm.type.checked.load``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare {i8*, i1} @llvm.type.checked.load(i8* %ptr, i32 %offset, metadata %type) argmemonly nounwind readonly
Arguments:
""""""""""
The first argument is a pointer from which to load a function pointer. The
second argument is the byte offset from which to load the function pointer. The
third argument is a metadata object representing a :doc:`type identifier
<TypeMetadata>`.
Overview:
"""""""""
The ``llvm.type.checked.load`` intrinsic safely loads a function pointer from a
virtual table pointer using type metadata. This intrinsic is used to implement
control flow integrity in conjunction with virtual call optimization. The
virtual call optimization pass will optimize away ``llvm.type.checked.load``
intrinsics associated with devirtualized calls, thereby removing the type
check in cases where it is not needed to enforce the control flow integrity
constraint.
If the given pointer is associated with a type metadata identifier, this
function returns true as the second element of its return value. (Note that
the function may also return true if the given pointer is not associated
with a type metadata identifier.) If the function's return value's second
element is true, the following rules apply to the first element:
- If the given pointer is associated with the given type metadata identifier,
it is the function pointer loaded from the given byte offset from the given
pointer.
- If the given pointer is not associated with the given type metadata
identifier, it is one of the following (the choice of which is unspecified):
1. The function pointer that would have been loaded from an arbitrarily chosen
(through an unspecified mechanism) pointer associated with the type
metadata.
2. If the function has a non-void return type, a pointer to a function that
returns an unspecified value without causing side effects.
If the function's return value's second element is false, the value of the
first element is undefined.
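For illustration, a CFI-protected virtual call might load the function pointer at a
fixed byte offset of the vtable and branch on the check result. The offset, type
identifier, and callee signature below are hypothetical:

.. code-block:: llvm

    declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata)
    declare void @llvm.trap()

    define i32 @virtual_call(i8* %vtable, i8* %obj) {
      %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtable, i32 8, metadata !"_ZTS1A")
      %ok   = extractvalue {i8*, i1} %pair, 1
      br i1 %ok, label %cont, label %trap

    trap:
      call void @llvm.trap()
      unreachable

    cont:
      %raw = extractvalue {i8*, i1} %pair, 0
      %fp  = bitcast i8* %raw to i32 (i8*)*
      %r   = call i32 %fp(i8* %obj)
      ret i32 %r
    }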
'``llvm.donothing``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.donothing() nounwind readnone
Overview:
"""""""""
The ``llvm.donothing`` intrinsic doesn't perform any operation. It's one of only
three intrinsics (the others being ``llvm.experimental.patchpoint`` and
``llvm.experimental.gc.statepoint``) that can be called with an invoke
instruction.
Arguments:
""""""""""
None.
Semantics:
""""""""""
This intrinsic does nothing, and it's removed by optimizers and ignored
by codegen.
'``llvm.experimental.deoptimize``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare type @llvm.experimental.deoptimize(...) [ "deopt"(...) ]
Overview:
"""""""""
This intrinsic, together with :ref:`deoptimization operand bundles
<deopt_opbundles>`, allows frontends to express transfer of control and
frame-local state from the currently executing (typically more specialized,
hence faster) version of a function into another (typically more generic, hence
slower) version.
In languages with a fully integrated managed runtime like Java and JavaScript
this intrinsic can be used to implement "uncommon trap" or "side exit" like
functionality. In unmanaged languages like C and C++, this intrinsic can be
used to represent the slow paths of specialized functions.
Arguments:
""""""""""
The intrinsic takes an arbitrary number of arguments, whose meaning is
decided by the :ref:`lowering strategy<deoptimize_lowering>`.
Semantics:
""""""""""
The ``@llvm.experimental.deoptimize`` intrinsic executes an attached
deoptimization continuation (denoted using a :ref:`deoptimization
operand bundle <deopt_opbundles>`) and returns the value returned by
the deoptimization continuation. Defining the semantic properties of
the continuation itself is out of scope of the language reference --
as far as LLVM is concerned, the deoptimization continuation can
invoke arbitrary side effects, including reading from and writing to
the entire heap.
Deoptimization continuations expressed using ``"deopt"`` operand bundles always
continue execution to the end of the physical frame containing them, so all
calls to ``@llvm.experimental.deoptimize`` must be in "tail position":
- ``@llvm.experimental.deoptimize`` cannot be invoked.
- The call must immediately precede a :ref:`ret <i_ret>` instruction.
- The ``ret`` instruction must return the value produced by the
``@llvm.experimental.deoptimize`` call if there is one, or void.
Note that the above restrictions imply that the return type for a call to
``@llvm.experimental.deoptimize`` will match the return type of its immediate
caller.
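For example, in a hypothetical specialized function the slow path can transfer
control to a more generic version through the deoptimization continuation; the
``"deopt"`` state shown is frontend-defined:

.. code-block:: llvm

    declare i32 @llvm.experimental.deoptimize.i32(...)

    define i32 @specialized(i32 %x) {
    entry:
      %in_range = icmp slt i32 %x, 100
      br i1 %in_range, label %fast, label %slow

    fast:
      %r = mul i32 %x, 2
      ret i32 %r

    slow:
      ; Tail position: the call immediately precedes the ret, and the ret returns
      ; its value, so the return type matches the caller, @specialized.
      %d = call i32 (...) @llvm.experimental.deoptimize.i32(i32 %x) [ "deopt"(i32 %x) ]
      ret i32 %d
    }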
The inliner composes the ``"deopt"`` continuations of the caller into the
``"deopt"`` continuations present in the inlinee, and also updates calls to this
intrinsic to return directly from the frame of the function it inlined into.
All declarations of ``@llvm.experimental.deoptimize`` must share the
same calling convention.
.. _deoptimize_lowering:
Lowering:
"""""""""
Calls to ``@llvm.experimental.deoptimize`` are lowered to calls to the
symbol ``__llvm_deoptimize`` (it is the frontend's responsibility to
ensure that this symbol is defined). The call arguments to
``@llvm.experimental.deoptimize`` are lowered as if they were formal
arguments of the specified types, and not as varargs.
'``llvm.experimental.guard``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare void @llvm.experimental.guard(i1, ...) [ "deopt"(...) ]
Overview:
"""""""""
This intrinsic, together with :ref:`deoptimization operand bundles
<deopt_opbundles>`, allows frontends to express guards or checks on
optimistic assumptions made during compilation. The semantics of
``@llvm.experimental.guard`` is defined in terms of
``@llvm.experimental.deoptimize`` -- its body is defined to be
equivalent to:
.. code-block:: text
define void @llvm.experimental.guard(i1 %pred, <args...>) {
%realPred = and i1 %pred, undef
br i1 %realPred, label %continue, label %leave [, !make.implicit !{}]
leave:
call void @llvm.experimental.deoptimize(<args...>) [ "deopt"() ]
ret void
continue:
ret void
}
with the optional ``[, !make.implicit !{}]`` present if and only if it
is present on the call site. For more details on ``!make.implicit``,
see :doc:`FaultMaps`.
In words, ``@llvm.experimental.guard`` executes the attached
``"deopt"`` continuation if (but **not** only if) its first argument
is ``false``. Since the optimizer is allowed to replace the ``undef``
with an arbitrary value, it can optimize a guard to fail "spuriously",
i.e. without the original condition being false (hence the "not only
if"); and this allows for "check widening" type optimizations.
``@llvm.experimental.guard`` cannot be invoked.
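For example, a speculative bounds check can be expressed as a guard; the values and
the ``"deopt"`` state below are hypothetical:

.. code-block:: llvm

    declare void @llvm.experimental.guard(i1, ...)

    define i32 @read(i32* %base, i32 %idx, i32 %len) {
      %in_bounds = icmp ult i32 %idx, %len
      ; Deoptimizes (at least) whenever %in_bounds is false.
      call void (i1, ...) @llvm.experimental.guard(i1 %in_bounds) [ "deopt"(i32 %idx) ]
      %p = getelementptr inbounds i32, i32* %base, i32 %idx
      %v = load i32, i32* %p
      ret i32 %v
    }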
'``llvm.load.relative``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
::
declare i8* @llvm.load.relative.iN(i8* %ptr, iN %offset) argmemonly nounwind readonly
Overview:
"""""""""
This intrinsic loads a 32-bit value from the address ``%ptr + %offset``,
adds ``%ptr`` to that value and returns it. The constant folder specifically
recognizes the form of this intrinsic and the constant initializers it may
load from; if a loaded constant initializer is known to have the form
``i32 trunc(x - %ptr)``, the intrinsic call is folded to ``x``.
LLVM provides that the calculation of such a constant initializer will
not overflow at link time under the medium code model if ``x`` is an
``unnamed_addr`` function. However, it does not provide this guarantee for
a constant initializer folded into a function body. This intrinsic can be
used to avoid the possibility of overflows when loading from such a constant.
Stack Map Intrinsics
--------------------
LLVM provides experimental intrinsics to support runtime patching
mechanisms commonly desired in dynamic language JITs. These intrinsics
are described in :doc:`StackMaps`.
Element Wise Atomic Memory Intrinsics
-------------------------------------
These intrinsics are similar to the standard library memory intrinsics except
that they perform memory transfer as a sequence of atomic memory accesses.
.. _int_memcpy_element_unordered_atomic:
'``llvm.memcpy.element.unordered.atomic``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.memcpy.element.unordered.atomic`` on
any integer bit width and for different address spaces. Not all targets
support all bit widths however.
::
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* <dest>,
i8* <src>,
i32 <len>,
i32 <element_size>)
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* <dest>,
i8* <src>,
i64 <len>,
i32 <element_size>)
Overview:
"""""""""
The '``llvm.memcpy.element.unordered.atomic.*``' intrinsic is a specialization of the
'``llvm.memcpy.*``' intrinsic. It differs in that the ``dest`` and ``src`` are treated
as arrays with elements that are exactly ``element_size`` bytes, and the copy between
buffers uses a sequence of :ref:`unordered atomic <ordering>` load/store operations
that are a positive integer multiple of the ``element_size`` in size.
Arguments:
""""""""""
The first three arguments are the same as they are in the :ref:`@llvm.memcpy <int_memcpy>`
intrinsic, with the added constraint that ``len`` is required to be a positive integer
multiple of the ``element_size``. If ``len`` is not a positive integer multiple of
``element_size``, then the behaviour of the intrinsic is undefined.
``element_size`` must be a compile-time constant positive power of two no greater than
a target-specific atomic access size limit.
For each of the input pointers, the ``align`` parameter attribute must be specified. It
must be a power of two no less than the ``element_size``. The caller guarantees that
both the source and destination pointers are aligned to that boundary.
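For example, the following hypothetical call copies 32 bytes as unordered-atomic
4-byte accesses; note the required ``align`` parameter attributes and that ``len``
is a positive multiple of ``element_size``:

.. code-block:: llvm

    declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8*, i8*, i32, i32)

    define void @copy32(i8* align 4 %dest, i8* align 4 %src) {
      call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest,
                                                                    i8* align 4 %src,
                                                                    i32 32, i32 4)
      ret void
    }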
Semantics:
""""""""""
The '``llvm.memcpy.element.unordered.atomic.*``' intrinsic copies ``len`` bytes of
memory from the source location to the destination location. These locations are not
allowed to overlap. The memory copy is performed as a sequence of load/store operations
where each access is guaranteed to be a multiple of ``element_size`` bytes wide and
aligned at an ``element_size`` boundary.
The order of the copy is unspecified. The same value may be read from the source
buffer many times, but only one write is issued to the destination buffer per
element. It is well defined to have concurrent reads and writes to both source and
destination provided those reads and writes are unordered atomic when specified.
This intrinsic does not provide any additional ordering guarantees over those
provided by a set of unordered loads from the source location and stores to the
destination.
Lowering:
"""""""""
In the most general case, a call to '``llvm.memcpy.element.unordered.atomic.*``' is
lowered to a call to the symbol ``__llvm_memcpy_element_unordered_atomic_*``, where
'*' is replaced with the actual element size.
The optimizer is allowed to inline the memory copy when it's profitable to do so.
'``llvm.memmove.element.unordered.atomic``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use
``llvm.memmove.element.unordered.atomic`` on any integer bit width and for
different address spaces. Not all targets support all bit widths however.
::
declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* <dest>,
i8* <src>,
i32 <len>,
i32 <element_size>)
declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* <dest>,
i8* <src>,
i64 <len>,
i32 <element_size>)
Overview:
"""""""""
The '``llvm.memmove.element.unordered.atomic.*``' intrinsic is a specialization
of the '``llvm.memmove.*``' intrinsic. It differs in that the ``dest`` and
``src`` are treated as arrays with elements that are exactly ``element_size``
bytes, and the copy between buffers uses a sequence of
:ref:`unordered atomic <ordering>` load/store operations that are a positive
integer multiple of the ``element_size`` in size.
Arguments:
""""""""""
The first three arguments are the same as they are in the
:ref:`@llvm.memmove <int_memmove>` intrinsic, with the added constraint that
``len`` is required to be a positive integer multiple of the ``element_size``.
If ``len`` is not a positive integer multiple of ``element_size``, then the
behaviour of the intrinsic is undefined.
``element_size`` must be a compile-time constant positive power of two no
greater than a target-specific atomic access size limit.
For each of the input pointers the ``align`` parameter attribute must be
specified. It must be a power of two no less than the ``element_size``. Caller
guarantees that both the source and destination pointers are aligned to that
boundary.
Semantics:
""""""""""
The '``llvm.memmove.element.unordered.atomic.*``' intrinsic copies ``len`` bytes
of memory from the source location to the destination location. These locations
are allowed to overlap. The memory copy is performed as a sequence of load/store
operations where each access is guaranteed to be a multiple of ``element_size``
bytes wide and aligned at an ``element_size`` boundary.
The order of the copy is unspecified. The same value may be read from the source
buffer many times, but only one write is issued to the destination buffer per
element. It is well defined to have concurrent reads and writes to both source
and destination provided those reads and writes are unordered atomic when
specified.
This intrinsic does not provide any additional ordering guarantees over those
provided by a set of unordered loads from the source location and stores to the
destination.
Lowering:
"""""""""
In the most general case, a call to the
'``llvm.memmove.element.unordered.atomic.*``' intrinsic is lowered to a call to the
symbol ``__llvm_memmove_element_unordered_atomic_*``, where '*' is replaced with the
actual element size.
The optimizer is allowed to inline the memory copy when it's profitable to do so.
.. _int_memset_element_unordered_atomic:
'``llvm.memset.element.unordered.atomic``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.memset.element.unordered.atomic`` on
any integer bit width and for different address spaces. Not all targets
support all bit widths however.
::
declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* <dest>,
i8 <value>,
i32 <len>,
i32 <element_size>)
declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* <dest>,
i8 <value>,
i64 <len>,
i32 <element_size>)
Overview:
"""""""""
The '``llvm.memset.element.unordered.atomic.*``' intrinsic is a specialization of the
'``llvm.memset.*``' intrinsic. It differs in that the ``dest`` is treated as an array
with elements that are exactly ``element_size`` bytes, and the assignment to that array
uses a sequence of :ref:`unordered atomic <ordering>` store operations
that are a positive integer multiple of the ``element_size`` in size.
Arguments:
""""""""""
The first three arguments are the same as they are in the :ref:`@llvm.memset <int_memset>`
intrinsic, with the added constraint that ``len`` is required to be a positive integer
multiple of the ``element_size``. If ``len`` is not a positive integer multiple of
``element_size``, then the behaviour of the intrinsic is undefined.
``element_size`` must be a compile-time constant positive power of two no greater than
a target-specific atomic access size limit.
The ``dest`` input pointer must have the ``align`` parameter attribute specified. It
must be a power of two no less than the ``element_size``. The caller guarantees that
the destination pointer is aligned to that boundary.
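For example, the following hypothetical call zero-fills 64 bytes as unordered-atomic
8-byte stores; ``dest`` carries the required ``align`` attribute and ``len`` is a
positive multiple of ``element_size``:

.. code-block:: llvm

    declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8*, i8, i32, i32)

    define void @zero64(i8* align 8 %dest) {
      call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 %dest,
                                                               i8 0, i32 64, i32 8)
      ret void
    }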
Semantics:
""""""""""
The '``llvm.memset.element.unordered.atomic.*``' intrinsic sets the ``len`` bytes of
memory starting at the destination location to the given ``value``. The memory is
set with a sequence of store operations where each access is guaranteed to be a
multiple of ``element_size`` bytes wide and aligned at an ``element_size`` boundary.
The order of the assignment is unspecified. Only one write is issued to the
destination buffer per element. It is well defined to have concurrent reads and
writes to the destination provided those reads and writes are unordered atomic
when specified.
This intrinsic does not provide any additional ordering guarantees over those
provided by a set of unordered stores to the destination.
Lowering:
"""""""""
In the most general case, a call to '``llvm.memset.element.unordered.atomic.*``' is
lowered to a call to the symbol ``__llvm_memset_element_unordered_atomic_*``, where
'*' is replaced with the actual element size.
The optimizer is allowed to inline the memory assignment when it's profitable to do so.
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index dcd2ec7eb22b..48af491f1214 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -1,177 +1,211 @@
========================
LLVM 5.0.0 Release Notes
========================
.. contents::
:local:
.. warning::
These are in-progress notes for the upcoming LLVM 5 release.
Release notes for previous releases can be found on
`the Download Page <http://releases.llvm.org/download.html>`_.
Introduction
============
This document contains the release notes for the LLVM Compiler Infrastructure,
release 5.0.0. Here we describe the status of LLVM, including major improvements
from the previous release, improvements in various subprojects of LLVM, and
some of the current users of the code. All LLVM releases may be downloaded
from the `LLVM releases web site <http://llvm.org/releases/>`_.
For more information about LLVM, including information about the latest
release, please check out the `main LLVM web site <http://llvm.org/>`_. If you
have questions or comments, the `LLVM Developer's Mailing List
<http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
them.
Note that if you are reading this file from a Subversion checkout or the main
LLVM web page, this document applies to the *next* release, not the current
one. To see the release notes for a specific release, please see the `releases
page <http://llvm.org/releases/>`_.
Non-comprehensive list of changes in this release
=================================================
.. NOTE
For small 1-3 sentence descriptions, just add an entry at the end of
this list. If your description won't fit comfortably in one bullet
point (e.g. maybe you would like to give an example of the
functionality, or simply have a lot to talk about), see the `NOTE` below
for adding a new subsection.
* LLVM's ``WeakVH`` has been renamed to ``WeakTrackingVH`` and a new ``WeakVH``
has been introduced. The new ``WeakVH`` nulls itself out on deletion, but
does not track values across RAUW.
* A new library named ``BinaryFormat`` has been created which holds a collection
of code which previously lived in ``Support``. This includes the
``file_magic`` structure and ``identify_magic`` functions, as well as all the
structure and type definitions for DWARF, ELF, COFF, WASM, and MachO file
formats.
* The tool ``llvm-pdbdump`` has been renamed ``llvm-pdbutil`` to better reflect
its nature as a general purpose PDB manipulation / diagnostics tool that does
more than just dumping contents.
* The ``BBVectorize`` pass has been removed. It was fully replaced and no
longer used back in 2014 but we didn't get around to removing it. Now it is
gone. The SLP vectorizer is the suggested non-loop vectorization pass.
.. NOTE
If you would like to document a larger change, then you can add a
subsection about it right here. You can copy the following boilerplate
and un-indent it (the indentation causes it to be inside this comment).
Special New Feature
-------------------
Makes programs 10x faster by doing Special New Thing.
Changes to the LLVM IR
----------------------
* The datalayout string may now indicate an address space to use for
the pointer type of alloca rather than the default of 0.
* Added the ``speculatable`` attribute, indicating a function that has no
side-effects which could inhibit hoisting of calls.
Changes to the ARM Backend
--------------------------
During this release ...
Changes to the MIPS Target
--------------------------
During this release ...
Changes to the PowerPC Target
-----------------------------
During this release ...
Changes to the X86 Target
-------------------------
* Added initial AMD Ryzen (znver1) scheduler support.
* Added support for Intel Goldmont CPUs.
* Added support for avx512vpopcntdq instructions.
* Added heuristics to convert CMOV into branches when it may be profitable.
* More aggressive inlining of memcmp calls.
* Improve vXi64 shuffles on 32-bit targets.
* Improved use of PMOVMSKB for any_of/all_of comparison reductions.
* Improved Silvermont, Sandybridge, and Jaguar (btver2) schedulers.
* Improved support for AVX512 vector rotations.
* Added support for AMD Lightweight Profiling (LWP) instructions.
+* Avoid using slow LEA instructions.
+
+* Use alternative sequences for multiply by constant.
+
+* Improved lowering of strided shuffles.
+
+* Improved the AVX512 cost model used by the vectorizer.
+
+* Fix scalar code performance when AVX512 is enabled by making i1's illegal.
+
+* Fixed many inline assembly bugs.
+
Changes to the AMDGPU Target
-----------------------------
* Initial gfx9 support
Changes to the AVR Target
-----------------------------
This release consists mainly of bugfixes and implementations of features
required for compiling basic Rust programs.
* Enable the branch relaxation pass so that we don't crash on large
stack load/stores
* Add support for lowering bit-rotations to the native `ror` and `rol`
instructions
* Fix bug where function pointers were treated as pointers to RAM and not
pointers to program memory
* Fix broken code generation for shift-by-variable expressions
* Support zero-sized types in argument lists; this is impossible in C,
but possible in Rust
Changes to the OCaml bindings
-----------------------------
During this release ...
Changes to the C API
--------------------
* Deprecated the ``LLVMAddBBVectorizePass`` interface since the ``BBVectorize``
pass has been removed. It is now a no-op and will be removed in the next
release. Use ``LLVMAddSLPVectorizePass`` instead to get the supported SLP
vectorizer.
External Open Source Projects Using LLVM 5
==========================================
-* A project...
+Zig Programming Language
+------------------------
+
+`Zig <http://ziglang.org>`_ is an open-source programming language designed
+for robustness, optimality, and clarity. It integrates closely with C and is
+intended to eventually take the place of C. It uses LLVM to produce highly
+optimized native code and to cross-compile for any target out of the box. Zig
+is in alpha, with a beta release expected in September.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.
Additional Information
======================
A wide variety of additional information is available on the `LLVM web page
<http://llvm.org/>`_, in particular in the `documentation
<http://llvm.org/docs/>`_ section. The web page also contains versions of the
API documentation which is up-to-date with the Subversion version of the source
code. You can access versions of these documents specific to this release by
going into the ``llvm/docs/`` directory in the LLVM tree.
If you have any questions or comments about LLVM, please feel free to contact
us via the `mailing lists <http://llvm.org/docs/#maillist>`_.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 55a23c3cca9b..d6851f7143a5 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -1,1550 +1,1551 @@
//===- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the SelectionDAG class, and transitively defines the
// SDNode class and subclasses.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_SELECTIONDAG_H
#define LLVM_CODEGEN_SELECTIONDAG_H
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ArrayRecycler.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/RecyclingAllocator.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
namespace llvm {
class BlockAddress;
class Constant;
class ConstantFP;
class ConstantInt;
class DataLayout;
struct fltSemantics;
class GlobalValue;
struct KnownBits;
class LLVMContext;
class MachineBasicBlock;
class MachineConstantPoolValue;
class MCSymbol;
class OptimizationRemarkEmitter;
class SDDbgValue;
class SelectionDAG;
class SelectionDAGTargetInfo;
class TargetLowering;
class TargetMachine;
class TargetSubtargetInfo;
class Value;
class SDVTListNode : public FoldingSetNode {
friend struct FoldingSetTrait<SDVTListNode>;
/// A reference to an Interned FoldingSetNodeID for this node.
/// The Allocator in SelectionDAG holds the data.
/// SDVTList contains all types which are frequently accessed in SelectionDAG.
/// The size of this list is not expected to be big so it won't introduce
/// a memory penalty.
FoldingSetNodeIDRef FastID;
const EVT *VTs;
unsigned int NumVTs;
/// The hash value for SDVTList is fixed, so cache it to avoid
/// hash calculation.
unsigned HashValue;
public:
SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num) :
FastID(ID), VTs(VT), NumVTs(Num) {
HashValue = ID.ComputeHash();
}
SDVTList getSDVTList() {
SDVTList result = {VTs, NumVTs};
return result;
}
};
/// Specialize FoldingSetTrait for SDVTListNode
/// to avoid computing temp FoldingSetNodeID and hash value.
template<> struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTListNode> {
static void Profile(const SDVTListNode &X, FoldingSetNodeID& ID) {
ID = X.FastID;
}
static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID,
unsigned IDHash, FoldingSetNodeID &TempID) {
if (X.HashValue != IDHash)
return false;
return ID == X.FastID;
}
static unsigned ComputeHash(const SDVTListNode &X, FoldingSetNodeID &TempID) {
return X.HashValue;
}
};
template <> struct ilist_alloc_traits<SDNode> {
static void deleteNode(SDNode *) {
llvm_unreachable("ilist_traits<SDNode> shouldn't see a deleteNode call!");
}
};
/// Keeps track of dbg_value information through SDISel. We do
/// not build SDNodes for these so as not to perturb the generated code;
/// instead the info is kept off to the side in this structure. Each SDNode may
/// have one or more associated dbg_value entries. This information is kept in
/// DbgValMap.
/// Byval parameters are handled separately because they don't use alloca's,
/// which busts the normal mechanism. There is good reason for handling all
/// parameters separately: they may not have code generated for them, they
/// should always go at the beginning of the function regardless of other code
/// motion, and debug info for them is potentially useful even if the parameter
/// is unused. Right now only byval parameters are handled separately.
class SDDbgInfo {
BumpPtrAllocator Alloc;
SmallVector<SDDbgValue*, 32> DbgValues;
SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
DbgValMapType DbgValMap;
public:
SDDbgInfo() = default;
SDDbgInfo(const SDDbgInfo &) = delete;
SDDbgInfo &operator=(const SDDbgInfo &) = delete;
void add(SDDbgValue *V, const SDNode *Node, bool isParameter) {
if (isParameter) {
ByvalParmDbgValues.push_back(V);
} else DbgValues.push_back(V);
if (Node)
DbgValMap[Node].push_back(V);
}
/// \brief Invalidate all DbgValues attached to the node and remove
/// it from the Node-to-DbgValues map.
void erase(const SDNode *Node);
void clear() {
DbgValMap.clear();
DbgValues.clear();
ByvalParmDbgValues.clear();
Alloc.Reset();
}
BumpPtrAllocator &getAlloc() { return Alloc; }
bool empty() const {
return DbgValues.empty() && ByvalParmDbgValues.empty();
}
ArrayRef<SDDbgValue*> getSDDbgValues(const SDNode *Node) {
DbgValMapType::iterator I = DbgValMap.find(Node);
if (I != DbgValMap.end())
return I->second;
return ArrayRef<SDDbgValue*>();
}
using DbgIterator = SmallVectorImpl<SDDbgValue*>::iterator;
DbgIterator DbgBegin() { return DbgValues.begin(); }
DbgIterator DbgEnd() { return DbgValues.end(); }
DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); }
};
void checkForCycles(const SelectionDAG *DAG, bool force = false);
/// This is used to represent a portion of an LLVM function in a low-level
/// Data Dependence DAG representation suitable for instruction selection.
/// This DAG is constructed as the first step of instruction selection in order
/// to allow implementation of machine specific optimizations
/// and code simplifications.
///
/// The representation used by the SelectionDAG is a target-independent
/// representation, which has some similarities to the GCC RTL representation,
/// but is significantly more simple, powerful, and is a graph form instead of a
/// linear form.
///
class SelectionDAG {
const TargetMachine &TM;
const SelectionDAGTargetInfo *TSI = nullptr;
const TargetLowering *TLI = nullptr;
MachineFunction *MF;
LLVMContext *Context;
CodeGenOpt::Level OptLevel;
/// The function-level optimization remark emitter. Used to emit remarks
/// whenever manipulating the DAG.
OptimizationRemarkEmitter *ORE;
/// The starting token.
SDNode EntryNode;
/// The root of the entire DAG.
SDValue Root;
/// A linked list of nodes in the current DAG.
ilist<SDNode> AllNodes;
/// The AllocatorType for allocating SDNodes. We use
/// pool allocation with recycling.
using NodeAllocatorType = RecyclingAllocator<BumpPtrAllocator, SDNode,
sizeof(LargestSDNode),
alignof(MostAlignedSDNode)>;
/// Pool allocation for nodes.
NodeAllocatorType NodeAllocator;
/// This structure is used to memoize nodes, automatically performing
/// CSE with existing nodes when a duplicate is requested.
FoldingSet<SDNode> CSEMap;
/// Pool allocation for machine-opcode SDNode operands.
BumpPtrAllocator OperandAllocator;
ArrayRecycler<SDUse> OperandRecycler;
/// Pool allocation for misc. objects that are created once per SelectionDAG.
BumpPtrAllocator Allocator;
/// Tracks dbg_value information through SDISel.
SDDbgInfo *DbgInfo;
uint16_t NextPersistentId = 0;
public:
/// Clients of various APIs that cause global effects on
/// the DAG can optionally implement this interface. This allows the clients
/// to handle the various sorts of updates that happen.
///
/// A DAGUpdateListener automatically registers itself with DAG when it is
/// constructed, and removes itself when destroyed in RAII fashion.
struct DAGUpdateListener {
DAGUpdateListener *const Next;
SelectionDAG &DAG;
explicit DAGUpdateListener(SelectionDAG &D)
: Next(D.UpdateListeners), DAG(D) {
DAG.UpdateListeners = this;
}
virtual ~DAGUpdateListener() {
assert(DAG.UpdateListeners == this &&
"DAGUpdateListeners must be destroyed in LIFO order");
DAG.UpdateListeners = Next;
}
/// The node N that was deleted and, if E is not null, an
/// equivalent node E that replaced it.
virtual void NodeDeleted(SDNode *N, SDNode *E);
/// The node N that was updated.
virtual void NodeUpdated(SDNode *N);
};
struct DAGNodeDeletedListener : public DAGUpdateListener {
std::function<void(SDNode *, SDNode *)> Callback;
DAGNodeDeletedListener(SelectionDAG &DAG,
std::function<void(SDNode *, SDNode *)> Callback)
: DAGUpdateListener(DAG), Callback(std::move(Callback)) {}
void NodeDeleted(SDNode *N, SDNode *E) override { Callback(N, E); }
};
/// When true, additional steps are taken to
/// ensure that getConstant() and similar functions return DAG nodes that
/// have legal types. This is important after type legalization since
/// any illegally typed nodes generated after this point will not experience
/// type legalization.
bool NewNodesMustHaveLegalTypes = false;
private:
/// DAGUpdateListener is a friend so it can manipulate the listener stack.
friend struct DAGUpdateListener;
/// Linked list of registered DAGUpdateListener instances.
/// This stack is maintained by DAGUpdateListener RAII.
DAGUpdateListener *UpdateListeners = nullptr;
/// Implementation of setSubgraphColor.
/// Return whether we had to truncate the search.
bool setSubgraphColorHelper(SDNode *N, const char *Color,
DenseSet<SDNode *> &visited,
int level, bool &printed);
template <typename SDNodeT, typename... ArgTypes>
SDNodeT *newSDNode(ArgTypes &&... Args) {
return new (NodeAllocator.template Allocate<SDNodeT>())
SDNodeT(std::forward<ArgTypes>(Args)...);
}
/// Build a synthetic SDNodeT with the given args and extract its subclass
/// data as an integer (e.g. for use in a folding set).
///
/// The args to this function are the same as the args to SDNodeT's
/// constructor, except the second arg (assumed to be a const DebugLoc&) is
/// omitted.
template <typename SDNodeT, typename... ArgTypes>
static uint16_t getSyntheticNodeSubclassData(unsigned IROrder,
ArgTypes &&... Args) {
// The compiler can reduce this expression to a constant iff we pass an
// empty DebugLoc. Thankfully, the debug location doesn't have any bearing
// on the subclass data.
return SDNodeT(IROrder, DebugLoc(), std::forward<ArgTypes>(Args)...)
.getRawSubclassData();
}
void createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
assert(!Node->OperandList && "Node already has operands");
SDUse *Ops = OperandRecycler.allocate(
ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
for (unsigned I = 0; I != Vals.size(); ++I) {
Ops[I].setUser(Node);
Ops[I].setInitial(Vals[I]);
}
Node->NumOperands = Vals.size();
Node->OperandList = Ops;
checkForCycles(Node);
}
void removeOperands(SDNode *Node) {
if (!Node->OperandList)
return;
OperandRecycler.deallocate(
ArrayRecycler<SDUse>::Capacity::get(Node->NumOperands),
Node->OperandList);
Node->NumOperands = 0;
Node->OperandList = nullptr;
}
public:
explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
SelectionDAG(const SelectionDAG &) = delete;
SelectionDAG &operator=(const SelectionDAG &) = delete;
~SelectionDAG();
/// Prepare this SelectionDAG to process code in the given MachineFunction.
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE);
/// Clear state and free memory necessary to make this
/// SelectionDAG ready to process a new block.
void clear();
MachineFunction &getMachineFunction() const { return *MF; }
const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
const TargetMachine &getTarget() const { return TM; }
const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
LLVMContext *getContext() const {return Context; }
OptimizationRemarkEmitter &getORE() const { return *ORE; }
/// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
void viewGraph(const std::string &Title);
void viewGraph();
#ifndef NDEBUG
std::map<const SDNode *, std::string> NodeGraphAttrs;
#endif
/// Clear all previously defined node graph attributes.
/// Intended to be used from a debugging tool (eg. gdb).
void clearGraphAttrs();
/// Set graph attributes for a node. (eg. "color=red".)
void setGraphAttrs(const SDNode *N, const char *Attrs);
/// Get graph attributes for a node. (eg. "color=red".)
/// Used from getNodeAttributes.
const std::string getGraphAttrs(const SDNode *N) const;
/// Convenience for setting node color attribute.
void setGraphColor(const SDNode *N, const char *Color);
/// Convenience for setting subgraph color attribute.
void setSubgraphColor(SDNode *N, const char *Color);
using allnodes_const_iterator = ilist<SDNode>::const_iterator;
allnodes_const_iterator allnodes_begin() const { return AllNodes.begin(); }
allnodes_const_iterator allnodes_end() const { return AllNodes.end(); }
using allnodes_iterator = ilist<SDNode>::iterator;
allnodes_iterator allnodes_begin() { return AllNodes.begin(); }
allnodes_iterator allnodes_end() { return AllNodes.end(); }
ilist<SDNode>::size_type allnodes_size() const {
return AllNodes.size();
}
iterator_range<allnodes_iterator> allnodes() {
return make_range(allnodes_begin(), allnodes_end());
}
iterator_range<allnodes_const_iterator> allnodes() const {
return make_range(allnodes_begin(), allnodes_end());
}
/// Return the root tag of the SelectionDAG.
const SDValue &getRoot() const { return Root; }
/// Return the token chain corresponding to the entry of the function.
SDValue getEntryNode() const {
return SDValue(const_cast<SDNode *>(&EntryNode), 0);
}
/// Set the current root tag of the SelectionDAG.
///
const SDValue &setRoot(SDValue N) {
assert((!N.getNode() || N.getValueType() == MVT::Other) &&
"DAG root value is not a chain!");
if (N.getNode())
checkForCycles(N.getNode(), this);
Root = N;
if (N.getNode())
checkForCycles(this);
return Root;
}
/// This iterates over the nodes in the SelectionDAG, folding
/// certain types of nodes together, or eliminating superfluous nodes. The
/// Level argument controls whether Combine is allowed to produce nodes and
/// types that are illegal on the target.
void Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel);
/// This transforms the SelectionDAG into a SelectionDAG that
/// only uses types natively supported by the target.
/// Returns "true" if it made any changes.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
bool LegalizeTypes();
/// This transforms the SelectionDAG into a SelectionDAG that is
/// compatible with the target instruction selector, as indicated by the
/// TargetLowering object.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
void Legalize();
/// \brief Transforms a SelectionDAG node and any operands to it into a node
/// that is compatible with the target instruction selector, as indicated by
/// the TargetLowering object.
///
/// \returns true if \c N is a valid, legal node after calling this.
///
/// This essentially runs a single recursive walk of the \c Legalize process
/// over the given node (and its operands). This can be used to incrementally
/// legalize the DAG. All of the nodes which are directly replaced,
/// potentially including N, are added to the output parameter \c
/// UpdatedNodes so that the delta to the DAG can be understood by the
/// caller.
///
/// When this returns false, N has been legalized in a way that make the
/// pointer passed in no longer valid. It may have even been deleted from the
/// DAG, and so it shouldn't be used further. When this returns true, the
/// N passed in is a legal node, and can be immediately processed as such.
/// This may still have done some work on the DAG, and will still populate
/// UpdatedNodes with any new nodes replacing those originally in the DAG.
bool LegalizeOp(SDNode *N, SmallSetVector<SDNode *, 16> &UpdatedNodes);
/// This transforms the SelectionDAG into a SelectionDAG
/// that only uses vector math operations supported by the target. This is
/// necessary as a separate step from Legalize because unrolling a vector
/// operation can introduce illegal types, which requires running
/// LegalizeTypes again.
///
/// This returns true if it made any changes; in that case, LegalizeTypes
/// is called again before Legalize.
///
/// Note that this is an involved process that may invalidate pointers into
/// the graph.
bool LegalizeVectors();
/// This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNodes();
/// Remove the specified node from the system. This node must
/// have no referrers.
void DeleteNode(SDNode *N);
/// Return an SDVTList that represents the list of values specified.
SDVTList getVTList(EVT VT);
SDVTList getVTList(EVT VT1, EVT VT2);
SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3);
SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4);
SDVTList getVTList(ArrayRef<EVT> VTs);
//===--------------------------------------------------------------------===//
// Node creation methods.
/// \brief Create a ConstantSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
/// transformations (e.g., if the vector element type is illegal).
/// @{
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
SDValue getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
bool IsOpaque = false) {
return getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), DL,
VT, IsTarget, IsOpaque);
}
SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
bool isTarget = false, bool isOpaque = false);
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
bool isTarget = false);
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT,
bool isOpaque = false) {
return getConstant(Val, DL, VT, true, isOpaque);
}
SDValue getTargetConstant(const APInt &Val, const SDLoc &DL, EVT VT,
bool isOpaque = false) {
return getConstant(Val, DL, VT, true, isOpaque);
}
SDValue getTargetConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
bool isOpaque = false) {
return getConstant(Val, DL, VT, true, isOpaque);
}
/// @}
/// \brief Create a ConstantFPSDNode wrapping a constant value.
/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
///
/// If only legal types can be produced, this does the necessary
/// transformations (e.g., if the vector element type is illegal).
/// The forms that take a double should only be used for simple constants
/// that can be exactly represented in VT. No checks are made.
/// @{
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getConstantFP(const ConstantFP &CF, const SDLoc &DL, EVT VT,
bool isTarget = false);
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT) {
return getConstantFP(Val, DL, VT, true);
}
SDValue getTargetConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT) {
return getConstantFP(Val, DL, VT, true);
}
SDValue getTargetConstantFP(const ConstantFP &Val, const SDLoc &DL, EVT VT) {
return getConstantFP(Val, DL, VT, true);
}
/// @}
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
int64_t offset = 0, bool isTargetGA = false,
unsigned char TargetFlags = 0);
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
int64_t offset = 0,
unsigned char TargetFlags = 0) {
return getGlobalAddress(GV, DL, VT, offset, true, TargetFlags);
}
SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false);
SDValue getTargetFrameIndex(int FI, EVT VT) {
return getFrameIndex(FI, VT, true);
}
SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false,
unsigned char TargetFlags = 0);
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) {
return getJumpTable(JTI, VT, true, TargetFlags);
}
SDValue getConstantPool(const Constant *C, EVT VT,
unsigned Align = 0, int Offs = 0, bool isT=false,
unsigned char TargetFlags = 0);
SDValue getTargetConstantPool(const Constant *C, EVT VT,
unsigned Align = 0, int Offset = 0,
unsigned char TargetFlags = 0) {
return getConstantPool(C, VT, Align, Offset, true, TargetFlags);
}
SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Align = 0, int Offs = 0, bool isT=false,
unsigned char TargetFlags = 0);
SDValue getTargetConstantPool(MachineConstantPoolValue *C,
EVT VT, unsigned Align = 0,
int Offset = 0, unsigned char TargetFlags=0) {
return getConstantPool(C, VT, Align, Offset, true, TargetFlags);
}
SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0,
unsigned char TargetFlags = 0);
// When generating a branch to a BB, we don't in general know enough
// to provide debug info for the BB at that time, so keep this one around.
SDValue getBasicBlock(MachineBasicBlock *MBB);
SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl);
SDValue getExternalSymbol(const char *Sym, EVT VT);
SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT);
SDValue getTargetExternalSymbol(const char *Sym, EVT VT,
unsigned char TargetFlags = 0);
SDValue getMCSymbol(MCSymbol *Sym, EVT VT);
SDValue getValueType(EVT);
SDValue getRegister(unsigned Reg, EVT VT);
SDValue getRegisterMask(const uint32_t *RegMask);
SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label);
SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
int64_t Offset = 0, bool isTarget = false,
unsigned char TargetFlags = 0);
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
int64_t Offset = 0,
unsigned char TargetFlags = 0) {
return getBlockAddress(BA, VT, Offset, true, TargetFlags);
}
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg,
SDValue N) {
return getNode(ISD::CopyToReg, dl, MVT::Other, Chain,
getRegister(Reg, N.getValueType()), N);
}
// This version of the getCopyToReg method takes an extra operand, which
// indicates that there is potentially an incoming glue value (if Glue is not
// null) and that there should be a glue result.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N,
SDValue Glue) {
SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue };
return getNode(ISD::CopyToReg, dl, VTs,
makeArrayRef(Ops, Glue.getNode() ? 4 : 3));
}
// Similar to last getCopyToReg() except parameter Reg is a SDValue
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, SDValue Reg, SDValue N,
SDValue Glue) {
SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Reg, N, Glue };
return getNode(ISD::CopyToReg, dl, VTs,
makeArrayRef(Ops, Glue.getNode() ? 4 : 3));
}
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT) {
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, getRegister(Reg, VT) };
return getNode(ISD::CopyFromReg, dl, VTs, Ops);
}
// This version of the getCopyFromReg method takes an extra operand, which
// indicates that there is potentially an incoming glue value (if Glue is not
// null) and that there should be a glue result.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT,
SDValue Glue) {
SDVTList VTs = getVTList(VT, MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue };
return getNode(ISD::CopyFromReg, dl, VTs,
makeArrayRef(Ops, Glue.getNode() ? 3 : 2));
}
SDValue getCondCode(ISD::CondCode Cond);
/// Return an ISD::VECTOR_SHUFFLE node. The number of elements in VT,
/// which must be a vector type, must match the number of mask elements
/// NumElts. An integer mask element equal to -1 is treated as undefined.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
ArrayRef<int> Mask);
/// Return an ISD::BUILD_VECTOR node. The number of elements in VT,
/// which must be a vector type, must match the number of operands in Ops.
/// The operands must have the same type as (or, for integers, a type wider
/// than) VT's element type.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDValue> Ops) {
// VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
}
/// Return an ISD::BUILD_VECTOR node. The number of elements in VT,
/// which must be a vector type, must match the number of operands in Ops.
/// The operands must have the same type as (or, for integers, a type wider
/// than) VT's element type.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDUse> Ops) {
// VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
}
/// Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all
/// elements. VT must be a vector type. Op's type must be the same as (or,
/// for integers, a type wider than) VT's element type.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) {
// VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
if (Op.getOpcode() == ISD::UNDEF) {
assert((VT.getVectorElementType() == Op.getValueType() ||
(VT.isInteger() &&
VT.getVectorElementType().bitsLE(Op.getValueType()))) &&
"A splatted value must have a width equal or (for integers) "
"greater than the vector element type!");
return getNode(ISD::UNDEF, SDLoc(), VT);
}
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Op);
return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
}
/// \brief Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
/// the shuffle node in input but with swapped operands.
///
/// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
/// Convert Op, which must be of float type, to the
/// float type VT, by either extending or rounding (by truncation).
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either any-extending or truncating it.
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either sign-extending or truncating it.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the
/// integer type VT, by either zero-extending or truncating it.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
/// Return the expression required to zero extend the Op
/// value assuming it was the smaller SrcTy value.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT SrcTy);
/// Return an operation which will any-extend the low lanes of the operand
/// into the specified vector type. For example,
/// this can convert a v16i8 into a v4i32 by any-extending the low four
/// lanes of the operand from i8 to i32.
SDValue getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Return an operation which will sign extend the low lanes of the operand
/// into the specified vector type. For example,
/// this can convert a v16i8 into a v4i32 by sign extending the low four
/// lanes of the operand from i8 to i32.
SDValue getSignExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Return an operation which will zero extend the low lanes of the operand
/// into the specified vector type. For example,
/// this can convert a v16i8 into a v4i32 by zero extending the low four
/// lanes of the operand from i8 to i32.
SDValue getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
/// Convert Op, which must be of integer type, to the integer type VT,
/// by using an extension appropriate for the target's
/// BooleanContent for type OpVT or truncating it.
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT);
/// Create a bitwise NOT operation as (XOR Val, -1).
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT);
/// \brief Create a logical NOT operation as (XOR Val, BooleanOne).
SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);
/// Return a new CALLSEQ_START node, that starts new call frame, in which
/// InSize bytes are set up inside CALLSEQ_START..CALLSEQ_END sequence and
/// OutSize specifies part of the frame set up prior to the sequence.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize,
const SDLoc &DL) {
SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain,
getIntPtrConstant(InSize, DL, true),
getIntPtrConstant(OutSize, DL, true) };
return getNode(ISD::CALLSEQ_START, DL, VTs, Ops);
}
/// Return a new CALLSEQ_END node, which always must have a
/// glue result (to ensure it's not CSE'd).
/// CALLSEQ_END does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2,
SDValue InGlue, const SDLoc &DL) {
SDVTList NodeTys = getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 4> Ops;
Ops.push_back(Chain);
Ops.push_back(Op1);
Ops.push_back(Op2);
if (InGlue.getNode())
Ops.push_back(InGlue);
return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops);
}
/// Return true if the result of this operation is always undefined.
bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);
/// Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getUNDEF(EVT VT) {
return getNode(ISD::UNDEF, SDLoc(), VT);
}
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
}
/// Gets or creates the specified node.
///
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys,
ArrayRef<SDValue> Ops);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs,
ArrayRef<SDValue> Ops);
// Specialize based on number of operands.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N,
const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, const SDNodeFlags Flags = SDNodeFlags());
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
SDValue N2, SDValue N3, SDValue N4, SDValue N5);
// Specialize again based on number of operands for nodes with a VTList
// rather than a single VT.
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
SDValue N2);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
SDValue N2, SDValue N3);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
SDValue N2, SDValue N3, SDValue N4);
SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTs, SDValue N1,
SDValue N2, SDValue N3, SDValue N4, SDValue N5);
/// Compute a TokenFactor to force all the incoming stack arguments to be
/// loaded from the stack. This is used in tail call lowering to protect
/// stack arguments from being clobbered.
SDValue getStackArgumentTokenFactor(SDValue Chain);
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool AlwaysInline,
bool isTailCall, MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo);
SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
SDValue Size, unsigned Align, bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo);
/// Helper function to make it easier to build SetCC's if you just
/// have an ISD::CondCode instead of an SDValue.
///
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS,
ISD::CondCode Cond) {
assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() &&
"Cannot compare scalars to vectors");
assert(LHS.getValueType().isVector() == VT.isVector() &&
"Cannot compare scalars to vectors");
assert(Cond != ISD::SETCC_INVALID &&
"Cannot create a setCC of an invalid node.");
return getNode(ISD::SETCC, DL, VT, LHS, RHS, getCondCode(Cond));
}
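// Illustrative sketch, not part of the original header: target lowering code
// typically uses this helper to build a boolean compare directly from an
// ISD::CondCode. Assuming DAG is this SelectionDAG, DL is an SDLoc, and A/B
// are existing i32 SDValues:
//   SDValue Cmp = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETLT);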
/// Helper function to make it easier to build Select's if you just
/// have operands and don't want to check for vector.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
SDValue RHS) {
assert(LHS.getValueType() == RHS.getValueType() &&
"Cannot use select on differing types");
assert(VT.isVector() == LHS.getValueType().isVector() &&
"Cannot mix vectors and scalars");
return getNode(Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
Cond, LHS, RHS);
}
/// Helper function to make it easier to build SelectCC's if you
/// just have an ISD::CondCode instead of an SDValue.
///
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
SDValue False, ISD::CondCode Cond) {
return getNode(ISD::SELECT_CC, DL, True.getValueType(),
LHS, RHS, True, False, getCondCode(Cond));
}
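// Illustrative sketch, not part of the original header: the helpers above can
// select between two values without materializing a separate boolean first.
// Assuming DL, A and B as in the previous example, a smaller-of-two-values
// node could be built as:
//   SDValue Min = DAG.getSelectCC(DL, A, B, A, B, ISD::SETLT);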
/// VAArg produces a result and token chain, and takes a pointer
/// and a source value as input.
SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
SDValue SV, unsigned Align);
/// Gets a node for an atomic cmpxchg op. There are two
/// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
/// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
/// a success flag (initially i1), and a chain.
SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTs, SDValue Chain, SDValue Ptr,
SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment, AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
SyncScope::ID SSID);
SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTs, SDValue Chain, SDValue Ptr,
SDValue Cmp, SDValue Swp, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result (if relevant)
/// and chain and takes 2 operands.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
SDValue Ptr, SDValue Val, const Value *PtrVal,
unsigned Alignment, AtomicOrdering Ordering,
SyncScope::ID SSID);
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
SDValue Ptr, SDValue Val, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result and chain and
/// takes 1 operand.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, EVT VT,
SDValue Chain, SDValue Ptr, MachineMemOperand *MMO);
/// Gets a node for an atomic op, produces result and chain and takes N
/// operands.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
MachineMemOperand *MMO);
/// Creates a MemIntrinsicNode that may produce a
/// result and takes a list of operands. Opcode may be INTRINSIC_VOID,
/// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not
/// less than FIRST_TARGET_MEMORY_OPCODE.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachinePointerInfo PtrInfo, unsigned Align = 0,
bool Vol = false, bool ReadMem = true,
bool WriteMem = true, unsigned Size = 0);
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO);
/// Create a MERGE_VALUES node from the given operands.
SDValue getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl);
/// Loads are not normal binary operators: their result type is not
/// determined by their operands, and they produce a value AND a token chain.
///
/// This function will set the MOLoad flag on MMOFlags, but you can set it if
/// you want. The MOStore flag must not be set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes(),
const MDNode *Ranges = nullptr);
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
MachineMemOperand *MMO);
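// Illustrative sketch, not part of the original header: a simple anonymous
// i32 load chained onto an existing token. Assuming DAG, DL, Chain and a
// pointer-typed Ptr are already available:
//   SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
//   SDValue Value = Ld;                 // result 0: the loaded value
//   SDValue OutChain = Ld.getValue(1);  // result 1: the output token chain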
SDValue
getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain,
SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT,
unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
SDValue Chain, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO);
SDValue getIndexedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes(),
const MDNode *Ranges = nullptr);
SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
EVT MemVT, MachineMemOperand *MMO);
/// Helper function to build ISD::STORE nodes.
///
/// This function will set the MOStore flag on MMOFlags, but you can set it if
/// you want. The MOLoad and MOInvariant flags must not be set.
SDValue
getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachinePointerInfo PtrInfo, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachineMemOperand *MMO);
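// Illustrative sketch, not part of the original header: storing a value and
// threading the returned token chain, assuming DAG, DL, Chain, an SDValue Val
// and a pointer-typed Ptr are already available:
//   SDValue St = DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo());
//   Chain = St; // stores produce only a token chain result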
SDValue
getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT TVT, unsigned Alignment = 0,
MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
const AAMDNodes &AAInfo = AAMDNodes());
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, EVT TVT, MachineMemOperand *MMO);
SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM);
/// Returns the sum of the base pointer and offset.
SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL);
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
SDValue Mask, SDValue Src0, EVT MemVT,
MachineMemOperand *MMO, ISD::LoadExtType,
bool IsExpanding = false);
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, bool IsTruncating = false,
bool IsCompressing = false);
SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
/// Return (create a new or find an existing) target-specific memory node.
/// TargetMemSDNode should be a class derived from MemSDNode.
template <class TargetMemSDNode>
SDValue getTargetMemSDNode(SDVTList VTs, ArrayRef<SDValue> Ops,
const SDLoc &dl, EVT MemVT,
MachineMemOperand *MMO);
/// Construct a node to track a Value* through the backend.
SDValue getSrcValue(const Value *v);
/// Return an MDNodeSDNode which holds an MDNode.
SDValue getMDNode(const MDNode *MD);
/// Return a bitcast using the SDLoc of the value operand, and casting to the
/// provided type. Use getNode to set a custom SDLoc.
SDValue getBitcast(EVT VT, SDValue V);
/// Return an AddrSpaceCastSDNode.
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS,
unsigned DestAS);
/// Return the specified value casted to
/// the target's desired shift amount type.
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
SDValue expandVAArg(SDNode *Node);
/// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
SDValue expandVACopy(SDNode *Node);
/// *Mutate* the specified node in-place to have the
/// specified operands. If the resultant node already exists in the DAG,
/// this does not modify the specified node, instead it returns the node that
/// already exists. If the resultant node does not exist in the DAG, the
/// input node is returned. As a degenerate case, if you specify the same
/// input operands as the node already has, the input node is returned.
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4);
SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4, SDValue Op5);
SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops);
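// Illustrative sketch, not part of the original header: a DAG combine that
// swaps in a simplified operand. Because of CSE the returned node may not be
// N itself, so callers should use the return value. Assuming N has at least
// two operands and NewOp0 is a replacement SDValue:
//   SDNode *Res = DAG.UpdateNodeOperands(N, NewOp0, N->getOperand(1));
//   if (Res != N) { /* an equivalent node already existed in the DAG */ }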
/// These are used for target selectors to *mutate* the
/// specified node to have the specified return type, Target opcode, and
/// operands. Note that target opcodes are stored as
/// ~TargetOpcode in the node opcode field. The resultant node is returned.
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
SDValue Op1, SDValue Op2);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
SDValue Op1, SDValue Op2, SDValue Op3);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT,
ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, SDValue Op1);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2);
SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// This *mutates* the specified node to have the specified
/// return type, opcode, and operands.
SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs,
ArrayRef<SDValue> Ops);
/// Mutate the specified strict FP node to its non-strict equivalent,
/// unlinking the node from its chain and dropping the metadata arguments.
/// The node must be a strict FP node.
SDNode *mutateStrictFPToFP(SDNode *Node);
/// These are used for target selectors to create a new node
/// with specified return type(s), MachineInstr opcode, and operands.
///
/// Note that getMachineNode returns the resultant node. If there is already
/// a node of the specified opcode and operands, it returns that node instead
/// of the current one.
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
SDValue Op1, SDValue Op2, SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, SDValue Op1, SDValue Op2);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, SDValue Op1, SDValue Op2,
SDValue Op3);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, SDVTList VTs,
ArrayRef<SDValue> Ops);
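// Illustrative sketch, not part of the original header: instruction selection
// creates target instructions through these overloads. Assuming TargetOpc is
// some target-defined machine opcode (hypothetical here) and Op0/Op1 are i32
// operands:
//   MachineSDNode *MN = DAG.getMachineNode(TargetOpc, DL, MVT::i32, Op0, Op1);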
/// A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand);
/// A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand, SDValue Subreg);
/// Get the specified node if it's already available, or else return NULL.
SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef<SDValue> Ops,
const SDNodeFlags Flags = SDNodeFlags());
/// Creates a SDDbgValue node.
SDDbgValue *getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R,
bool IsIndirect, uint64_t Off, const DebugLoc &DL,
unsigned O);
/// Constant
SDDbgValue *getConstantDbgValue(MDNode *Var, MDNode *Expr, const Value *C,
uint64_t Off, const DebugLoc &DL, unsigned O);
/// FrameIndex
SDDbgValue *getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, unsigned FI,
uint64_t Off, const DebugLoc &DL,
unsigned O);
/// Remove the specified node from the system. If any of its
/// operands then becomes dead, remove them as well. Inform UpdateListener
/// for each node deleted.
void RemoveDeadNode(SDNode *N);
/// This method deletes the unreachable nodes in the
/// given list, and any nodes that become unreachable as a result.
void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes);
/// Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG. Use the first
/// version if 'From' is known to have a single result, use the second
/// if you have two nodes with identical results (or if 'To' has a superset
/// of the results of 'From'), use the third otherwise.
///
/// These methods all take an optional UpdateListener, which (if not null) is
/// informed about nodes that are deleted and modified due to recursive
/// changes in the dag.
///
/// These functions only replace all existing uses. It's possible that as
/// these replacements are being performed, CSE may cause the From node
/// to be given new uses. These new uses of From are left in place, and
/// not automatically transferred to To.
///
void ReplaceAllUsesWith(SDValue From, SDValue Op);
void ReplaceAllUsesWith(SDNode *From, SDNode *To);
void ReplaceAllUsesWith(SDNode *From, const SDValue *To);
/// Replace any uses of From with To, leaving
/// uses of other values produced by From.Val alone.
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To);
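// Illustrative sketch, not part of the original header: a common use of
// ReplaceAllUsesOfValueWith is rewiring only the chain result of a memory
// node. Assuming N is an SDNode whose result 1 is a token chain and NewChain
// is its replacement:
//   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewChain);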
/// Like ReplaceAllUsesOfValueWith, but for multiple values at once.
/// This correctly handles the case where
/// there is an overlap between the From values and the To values.
void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To,
unsigned Num);
/// If an existing load has uses of its chain, create a token factor node with
/// that chain and the new memory node's chain and update users of the old
/// chain to the token factor. This ensures that the new memory node will have
- /// the same relative memory dependency position as the old load.
- void makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
+ /// the same relative memory dependency position as the old load. Returns the
+ /// new merged load chain.
+ SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
/// Topological-sort the AllNodes list and
/// assign a unique node id for each node in the DAG based on their
/// topological order. Returns the number of nodes.
unsigned AssignTopologicalOrder();
/// Move node N in the AllNodes list to be immediately
/// before the given iterator Position. This may be used to update the
/// topological ordering when the list of nodes is modified.
void RepositionNode(allnodes_iterator Position, SDNode *N) {
AllNodes.insert(Position, AllNodes.remove(N));
}
/// Returns an APFloat semantics tag appropriate for the given type. If VT is
/// a vector type, the element semantics are returned.
static const fltSemantics &EVTToAPFloatSemantics(EVT VT) {
switch (VT.getScalarType().getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unknown FP format");
case MVT::f16: return APFloat::IEEEhalf();
case MVT::f32: return APFloat::IEEEsingle();
case MVT::f64: return APFloat::IEEEdouble();
case MVT::f80: return APFloat::x87DoubleExtended();
case MVT::f128: return APFloat::IEEEquad();
case MVT::ppcf128: return APFloat::PPCDoubleDouble();
}
}
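// Illustrative sketch, not part of the original header: mapping an FP value
// type to its APFloat semantics, e.g. for constant folding:
//   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(MVT::f32);
//   // Sem now refers to APFloat::IEEEsingle().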
/// Add a dbg_value SDNode. If SD is non-null that means the
/// value is produced by SD.
void AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter);
/// Get the debug values which reference the given SDNode.
ArrayRef<SDDbgValue*> GetDbgValues(const SDNode* SD) {
return DbgInfo->getSDDbgValues(SD);
}
private:
/// Transfer SDDbgValues. Called via ReplaceAllUses{OfValue}?With
void TransferDbgValues(SDValue From, SDValue To);
public:
/// Return true if there are any SDDbgValue nodes associated
/// with this SelectionDAG.
bool hasDebugValues() const { return !DbgInfo->empty(); }
SDDbgInfo::DbgIterator DbgBegin() { return DbgInfo->DbgBegin(); }
SDDbgInfo::DbgIterator DbgEnd() { return DbgInfo->DbgEnd(); }
SDDbgInfo::DbgIterator ByvalParmDbgBegin() {
return DbgInfo->ByvalParmDbgBegin();
}
SDDbgInfo::DbgIterator ByvalParmDbgEnd() {
return DbgInfo->ByvalParmDbgEnd();
}
void dump() const;
/// Create a stack temporary, suitable for holding the specified value type.
/// If minAlign is specified, the slot size will have at least that alignment.
SDValue CreateStackTemporary(EVT VT, unsigned minAlign = 1);
/// Create a stack temporary suitable for holding either of the specified
/// value types.
SDValue CreateStackTemporary(EVT VT1, EVT VT2);
SDValue FoldSymbolOffset(unsigned Opcode, EVT VT,
const GlobalAddressSDNode *GA,
const SDNode *N2);
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
SDNode *Cst1, SDNode *Cst2);
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
const ConstantSDNode *Cst1,
const ConstantSDNode *Cst2);
SDValue FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
const SDNodeFlags Flags = SDNodeFlags());
/// Constant fold a setcc to true or false.
SDValue FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond,
const SDLoc &dl);
/// Return true if the sign bit of Op is known to be zero.
/// We use this predicate to simplify operations downstream.
bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
/// Return true if 'Op & Mask' is known to be zero. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth = 0)
const;
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. For vectors, the known bits are those that are shared by
/// every vector element.
/// Targets can implement the computeKnownBitsForTargetNode method in the
/// TargetLowering class to allow target nodes to be understood.
void computeKnownBits(SDValue Op, KnownBits &Known, unsigned Depth = 0) const;
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. The DemandedElts argument allows us to only collect the
/// known bits that are shared by the requested vector elements.
/// Targets can implement the computeKnownBitsForTargetNode method in the
/// TargetLowering class to allow target nodes to be understood.
void computeKnownBits(SDValue Op, KnownBits &Known, const APInt &DemandedElts,
unsigned Depth = 0) const;
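// Illustrative sketch, not part of the original header: querying known bits
// of an arbitrary SDValue Op, e.g. to prove its low bit is clear:
//   KnownBits Known;
//   DAG.computeKnownBits(Op, Known);
//   bool LowBitClear = Known.Zero[0];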
/// Used to represent the possible overflow behavior of an operation.
/// Never: the operation cannot overflow.
/// Always: the operation will always overflow.
/// Sometime: the operation may or may not overflow.
enum OverflowKind {
OFK_Never,
OFK_Sometime,
OFK_Always,
};
/// Determine if the result of the addition of 2 nodes can overflow.
OverflowKind computeOverflowKind(SDValue N0, SDValue N1) const;
/// Test if the given value is known to have exactly one bit set. This differs
/// from computeKnownBits in that it doesn't necessarily determine which bit
/// is set.
bool isKnownToBeAPowerOfTwo(SDValue Val) const;
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign
/// bit (itself), but other cases can give us information. For example,
/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
/// to each other, so we return 3. Targets can implement the
/// ComputeNumSignBitsForTarget method in the TargetLowering class to allow
/// target nodes to be understood.
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign
/// bit (itself), but other cases can give us information. For example,
/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
/// to each other, so we return 3. The DemandedElts argument allows
/// us to only collect the minimum sign bits of the requested vector elements.
/// Targets can implement the ComputeNumSignBitsForTarget method in the
/// TargetLowering class to allow target nodes to be understood.
unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth = 0) const;
/// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
/// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
/// is guaranteed to have the same semantics as an ADD. This handles the
/// equivalence:
/// X|Cst == X+Cst iff X&Cst = 0.
bool isBaseWithConstantOffset(SDValue Op) const;
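// Worked example for the equivalence above (illustrative, not part of the
// original header): with X = 0b1000 and Cst = 0b0011 we have X & Cst == 0,
// so X | Cst == X + Cst == 0b1011, and an (or X, 3) can be treated as a base
// plus a constant offset of 3.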
/// Test whether the given SDValue is known to never be NaN.
bool isKnownNeverNaN(SDValue Op) const;
/// Test whether the given SDValue is known to never be positive or negative
/// zero.
bool isKnownNeverZero(SDValue Op) const;
/// Test whether two SDValues are known to compare equal. This
/// is true if they are the same value, or if one is negative zero and the
/// other positive zero.
bool isEqualTo(SDValue A, SDValue B) const;
/// Return true if A and B have no common bits set. As an example, this can
/// allow an 'add' to be transformed into an 'or'.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const;
/// Utility function used by legalize and lowering to
/// "unroll" a vector operation by splitting out the scalars and operating
/// on each element individually. If the ResNE is 0, fully unroll the vector
/// op. If ResNE is less than the width of the vector op, unroll up to ResNE.
/// If the ResNE is greater than the width of the vector op, unroll the
/// vector op and fill the end of the resulting vector with UNDEFS.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);
/// Return true if loads are next to each other and can be
/// merged. Check that both are nonvolatile and if LD is loading
/// 'Bytes' bytes from a location that is 'Dist' units away from the
/// location that the 'Base' load is loading from.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base,
unsigned Bytes, int Dist) const;
/// Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
unsigned InferPtrAlignment(SDValue Ptr) const;
/// Compute the VTs needed for the low/hi parts of a type
/// which is split (or expanded) into two not necessarily identical pieces.
std::pair<EVT, EVT> GetSplitDestVTs(const EVT &VT) const;
/// Split the vector with EXTRACT_SUBVECTOR using the provided
/// VTs and return the low/high part.
std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
const EVT &LoVT, const EVT &HiVT);
/// Split the vector with EXTRACT_SUBVECTOR and return the low/high part.
std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType());
return SplitVector(N, DL, LoVT, HiVT);
}
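// Illustrative sketch, not part of the original header: splitting a vector
// value Vec into halves with the inferred low/high types:
//   SDValue Lo, Hi;
//   std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);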
/// Split the node's operand with EXTRACT_SUBVECTOR and
/// return the low/high part.
std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
{
return SplitVector(N->getOperand(OpNo), SDLoc(N));
}
/// Append the extracted elements from Start to Count out of the vector Op
/// in Args. If Count is 0, all of the elements will be extracted.
void ExtractVectorElements(SDValue Op, SmallVectorImpl<SDValue> &Args,
unsigned Start = 0, unsigned Count = 0);
/// Compute the default alignment value for the given type.
unsigned getEVTAlignment(EVT MemoryVT) const;
/// Test whether the given value is a constant int or similar node.
SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N);
/// Test whether the given value is a constant FP or similar node.
SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N);
/// \returns true if \p N is any kind of constant or build_vector of
/// constants, int or float. If a vector, it may not necessarily be a splat.
inline bool isConstantValueOfAnyType(SDValue N) {
return isConstantIntBuildVectorOrConstantInt(N) ||
isConstantFPBuildVectorOrConstantFP(N);
}
private:
void InsertNode(SDNode *N);
bool RemoveNodeFromCSEMaps(SDNode *N);
void AddModifiedNodeToCSEMaps(SDNode *N);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2,
void *&InsertPos);
SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
void *&InsertPos);
SDNode *UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &loc);
void DeleteNodeNotInCSEMaps(SDNode *N);
void DeallocateNode(SDNode *N);
void allnodes_clear();
/// Look up the node specified by ID in CSEMap. If it exists, return it. If
/// not, return the insertion token that will make insertion faster. This
/// overload is for nodes other than Constant or ConstantFP, use the other one
/// for those.
SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos);
/// Look up the node specified by ID in CSEMap. If it exists, return it. If
/// not, return the insertion token that will make insertion faster. Performs
/// additional processing for constant nodes.
SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
void *&InsertPos);
/// List of non-single value types.
FoldingSet<SDVTListNode> VTListMap;
/// Maps to auto-CSE operations.
std::vector<CondCodeSDNode*> CondCodeNodes;
std::vector<SDNode*> ValueTypeNodes;
std::map<EVT, SDNode*, EVT::compareRawBits> ExtendedValueTypeNodes;
StringMap<SDNode*> ExternalSymbols;
std::map<std::pair<std::string, unsigned char>,SDNode*> TargetExternalSymbols;
DenseMap<MCSymbol *, SDNode *> MCSymbols;
};
template <> struct GraphTraits<SelectionDAG*> : public GraphTraits<SDNode*> {
using nodes_iterator = pointer_iterator<SelectionDAG::allnodes_iterator>;
static nodes_iterator nodes_begin(SelectionDAG *G) {
return nodes_iterator(G->allnodes_begin());
}
static nodes_iterator nodes_end(SelectionDAG *G) {
return nodes_iterator(G->allnodes_end());
}
};
template <class TargetMemSDNode>
SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
ArrayRef<SDValue> Ops,
const SDLoc &dl, EVT MemVT,
MachineMemOperand *MMO) {
/// Compose node ID and try to find an existing node.
FoldingSetNodeID ID;
unsigned Opcode =
TargetMemSDNode(dl.getIROrder(), DebugLoc(), VTs, MemVT, MMO).getOpcode();
ID.AddInteger(Opcode);
ID.AddPointer(VTs.VTs);
for (auto& Op : Ops) {
ID.AddPointer(Op.getNode());
ID.AddInteger(Op.getResNo());
}
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
ID.AddInteger(getSyntheticNodeSubclassData<TargetMemSDNode>(
dl.getIROrder(), VTs, MemVT, MMO));
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<TargetMemSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
/// Existing node was not found. Create a new one.
auto *N = newSDNode<TargetMemSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
} // end namespace llvm
#endif // LLVM_CODEGEN_SELECTIONDAG_H
diff --git a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
index 6c951fab6185..b7e462e85d9d 100644
--- a/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/LazyEmittingLayer.h
@@ -1,272 +1,272 @@
//===- LazyEmittingLayer.h - Lazily emit IR to lower JIT layers -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Contains the definition for a lazy-emitting layer for the JIT.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
#define LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <list>
#include <memory>
#include <string>
namespace llvm {
namespace orc {
/// @brief Lazy-emitting IR layer.
///
/// This layer accepts LLVM IR Modules (via addModule), but does not
/// immediately emit them to the layer below. Instead, emission to the base layer
/// is deferred until the first time the client requests the address (via
/// JITSymbol::getAddress) for a symbol contained in this layer.
template <typename BaseLayerT> class LazyEmittingLayer {
public:
using BaseLayerHandleT = typename BaseLayerT::ModuleHandleT;
private:
class EmissionDeferredModule {
public:
EmissionDeferredModule(std::shared_ptr<Module> M,
std::shared_ptr<JITSymbolResolver> Resolver)
: M(std::move(M)), Resolver(std::move(Resolver)) {}
JITSymbol find(StringRef Name, bool ExportedSymbolsOnly, BaseLayerT &B) {
switch (EmitState) {
case NotEmitted:
if (auto GV = searchGVs(Name, ExportedSymbolsOnly)) {
// Create a std::string version of Name to capture here - the argument
// (a StringRef) may go away before the lambda is executed.
// FIXME: Use capture-init when we move to C++14.
std::string PName = Name;
JITSymbolFlags Flags = JITSymbolFlags::fromGlobalValue(*GV);
auto GetAddress =
[this, ExportedSymbolsOnly, PName, &B]() -> Expected<JITTargetAddress> {
if (this->EmitState == Emitting)
return 0;
else if (this->EmitState == NotEmitted) {
this->EmitState = Emitting;
if (auto HandleOrErr = this->emitToBaseLayer(B))
Handle = std::move(*HandleOrErr);
else
return HandleOrErr.takeError();
this->EmitState = Emitted;
}
if (auto Sym = B.findSymbolIn(Handle, PName, ExportedSymbolsOnly))
return Sym.getAddress();
else if (auto Err = Sym.takeError())
return std::move(Err);
else
llvm_unreachable("Successful symbol lookup should return "
"definition address here");
};
return JITSymbol(std::move(GetAddress), Flags);
} else
return nullptr;
case Emitting:
// Calling "emit" can trigger a recursive call to 'find' (e.g. to check
// for pre-existing definitions of common symbols), but any symbol in
// this module would already have been found internally (in the
// RuntimeDyld that did the lookup), so just return a nullptr here.
return nullptr;
case Emitted:
return B.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
}
llvm_unreachable("Invalid emit-state.");
}
- void removeModuleFromBaseLayer(BaseLayerT &BaseLayer) {
- if (EmitState != NotEmitted)
- BaseLayer.removeModule(Handle);
+ Error removeModuleFromBaseLayer(BaseLayerT& BaseLayer) {
+ return EmitState != NotEmitted ? BaseLayer.removeModule(Handle)
+ : Error::success();
}
void emitAndFinalize(BaseLayerT &BaseLayer) {
assert(EmitState != Emitting &&
"Cannot emitAndFinalize while already emitting");
if (EmitState == NotEmitted) {
EmitState = Emitting;
Handle = emitToBaseLayer(BaseLayer);
EmitState = Emitted;
}
BaseLayer.emitAndFinalize(Handle);
}
private:
const GlobalValue* searchGVs(StringRef Name,
bool ExportedSymbolsOnly) const {
// FIXME: We could clean all this up if we had a way to reliably demangle
// names: We could just demangle name and search, rather than
// mangling everything else.
// If we have already built the mangled name set then just search it.
if (MangledSymbols) {
auto VI = MangledSymbols->find(Name);
if (VI == MangledSymbols->end())
return nullptr;
auto GV = VI->second;
if (!ExportedSymbolsOnly || GV->hasDefaultVisibility())
return GV;
return nullptr;
}
// If we haven't built the mangled name set yet, try to build it. As an
// optimization this will leave MangledSymbols set to nullptr if we find
// Name in the process of building the set.
return buildMangledSymbols(Name, ExportedSymbolsOnly);
}
Expected<BaseLayerHandleT> emitToBaseLayer(BaseLayerT &BaseLayer) {
// We don't need the mangled names set any more: Once we've emitted this
// to the base layer we'll just look for symbols there.
MangledSymbols.reset();
return BaseLayer.addModule(std::move(M), std::move(Resolver));
}
// If the mangled name of the given GlobalValue matches the given search
// name (and its visibility conforms to the ExportedSymbolsOnly flag) then
// return the symbol. Otherwise, add the mangled name to the Names map and
// return nullptr.
const GlobalValue* addGlobalValue(StringMap<const GlobalValue*> &Names,
const GlobalValue &GV,
const Mangler &Mang, StringRef SearchName,
bool ExportedSymbolsOnly) const {
// Modules don't "provide" decls or common symbols.
if (GV.isDeclaration() || GV.hasCommonLinkage())
return nullptr;
// Mangle the GV name.
std::string MangledName;
{
raw_string_ostream MangledNameStream(MangledName);
Mang.getNameWithPrefix(MangledNameStream, &GV, false);
}
// Check whether this is the name we were searching for, and if it is then
// bail out early.
if (MangledName == SearchName)
if (!ExportedSymbolsOnly || GV.hasDefaultVisibility())
return &GV;
// Otherwise add this to the map for later.
Names[MangledName] = &GV;
return nullptr;
}
// Build the MangledSymbols map. Bails out early (with MangledSymbols left set
// to nullptr) if the given SearchName is found while building the map.
const GlobalValue* buildMangledSymbols(StringRef SearchName,
bool ExportedSymbolsOnly) const {
assert(!MangledSymbols && "Mangled symbols map already exists?");
auto Symbols = llvm::make_unique<StringMap<const GlobalValue*>>();
Mangler Mang;
for (const auto &GO : M->global_objects())
if (auto GV = addGlobalValue(*Symbols, GO, Mang, SearchName,
ExportedSymbolsOnly))
return GV;
MangledSymbols = std::move(Symbols);
return nullptr;
}
enum { NotEmitted, Emitting, Emitted } EmitState = NotEmitted;
BaseLayerHandleT Handle;
std::shared_ptr<Module> M;
std::shared_ptr<JITSymbolResolver> Resolver;
mutable std::unique_ptr<StringMap<const GlobalValue*>> MangledSymbols;
};
using ModuleListT = std::list<std::unique_ptr<EmissionDeferredModule>>;
BaseLayerT &BaseLayer;
ModuleListT ModuleList;
public:
/// @brief Handle to a loaded module.
using ModuleHandleT = typename ModuleListT::iterator;
/// @brief Construct a lazy emitting layer.
LazyEmittingLayer(BaseLayerT &BaseLayer) : BaseLayer(BaseLayer) {}
/// @brief Add the given module to the lazy emitting layer.
Expected<ModuleHandleT>
addModule(std::shared_ptr<Module> M,
std::shared_ptr<JITSymbolResolver> Resolver) {
return ModuleList.insert(
ModuleList.end(),
llvm::make_unique<EmissionDeferredModule>(std::move(M),
std::move(Resolver)));
}
/// @brief Remove the module represented by the given handle.
///
/// This method will free the memory associated with the given module, both
/// in this layer, and the base layer.
Error removeModule(ModuleHandleT H) {
- (*H)->removeModuleFromBaseLayer(BaseLayer);
+ Error Err = (*H)->removeModuleFromBaseLayer(BaseLayer);
ModuleList.erase(H);
- return Error::success();
+ return Err;
}
/// @brief Search for the given named symbol.
/// @param Name The name of the symbol to search for.
/// @param ExportedSymbolsOnly If true, search only for exported symbols.
/// @return A handle for the given named symbol, if it exists.
JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) {
// Look for the symbol among existing definitions.
if (auto Symbol = BaseLayer.findSymbol(Name, ExportedSymbolsOnly))
return Symbol;
// If not found then search the deferred modules. If any of these contain a
// definition of 'Name' then they will return a JITSymbol that will emit
// the corresponding module when the symbol address is requested.
for (auto &DeferredMod : ModuleList)
if (auto Symbol = DeferredMod->find(Name, ExportedSymbolsOnly, BaseLayer))
return Symbol;
// If no definition found anywhere return a null symbol.
return nullptr;
}
/// @brief Get the address of the given symbol in the context of the set of
/// compiled modules represented by the handle H.
JITSymbol findSymbolIn(ModuleHandleT H, const std::string &Name,
bool ExportedSymbolsOnly) {
return (*H)->find(Name, ExportedSymbolsOnly, BaseLayer);
}
/// @brief Immediately emit and finalize the module represented by the given
/// handle.
/// @param H Handle for module to emit/finalize.
Error emitAndFinalize(ModuleHandleT H) {
return (*H)->emitAndFinalize(BaseLayer);
}
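// Illustrative sketch, not part of the original header: a hypothetical client
// stacking this layer on top of a compiling base layer. CompileLayer, M and
// Resolver are assumed to exist already:
//   LazyEmittingLayer<decltype(CompileLayer)> LazyLayer(CompileLayer);
//   auto H = LazyLayer.addModule(std::move(M), Resolver); // emission deferred
//   auto Sym = LazyLayer.findSymbol("main", true);  // finds deferred def
//   if (Sym)
//     auto Addr = Sym.getAddress();                 // triggers emission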
};
} // end namespace orc
} // end namespace llvm
#endif // LLVM_EXECUTIONENGINE_ORC_LAZYEMITTINGLAYER_H
diff --git a/include/llvm/Object/COFFImportFile.h b/include/llvm/Object/COFFImportFile.h
index 8e215b565fc4..cf9c80a06f49 100644
--- a/include/llvm/Object/COFFImportFile.h
+++ b/include/llvm/Object/COFFImportFile.h
@@ -1,106 +1,108 @@
//===- COFFImportFile.h - COFF short import file implementation -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// COFF short import file is a special kind of file which contains
// only symbol names for DLL-exported symbols. This class implements
// exporting of Symbols to create libraries and a SymbolicFile
// interface for the file type.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_OBJECT_COFF_IMPORT_FILE_H
#define LLVM_OBJECT_COFF_IMPORT_FILE_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolicFile.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace object {
class COFFImportFile : public SymbolicFile {
public:
COFFImportFile(MemoryBufferRef Source)
: SymbolicFile(ID_COFFImportFile, Source) {}
static bool classof(Binary const *V) { return V->isCOFFImportFile(); }
void moveSymbolNext(DataRefImpl &Symb) const override { ++Symb.p; }
std::error_code printSymbolName(raw_ostream &OS,
DataRefImpl Symb) const override {
if (Symb.p == 0)
OS << "__imp_";
OS << StringRef(Data.getBufferStart() + sizeof(coff_import_header));
return std::error_code();
}
uint32_t getSymbolFlags(DataRefImpl Symb) const override {
return SymbolRef::SF_Global;
}
basic_symbol_iterator symbol_begin() const override {
return BasicSymbolRef(DataRefImpl(), this);
}
basic_symbol_iterator symbol_end() const override {
DataRefImpl Symb;
Symb.p = isData() ? 1 : 2;
return BasicSymbolRef(Symb, this);
}
const coff_import_header *getCOFFImportHeader() const {
return reinterpret_cast<const object::coff_import_header *>(
Data.getBufferStart());
}
private:
bool isData() const {
return getCOFFImportHeader()->getType() == COFF::IMPORT_DATA;
}
};
struct COFFShortExport {
std::string Name;
std::string ExtName;
+ std::string SymbolName;
uint16_t Ordinal = 0;
bool Noname = false;
bool Data = false;
bool Private = false;
bool Constant = false;
bool isWeak() {
return ExtName.size() && ExtName != Name;
}
friend bool operator==(const COFFShortExport &L, const COFFShortExport &R) {
return L.Name == R.Name && L.ExtName == R.ExtName &&
L.Ordinal == R.Ordinal && L.Noname == R.Noname &&
L.Data == R.Data && L.Private == R.Private;
}
friend bool operator!=(const COFFShortExport &L, const COFFShortExport &R) {
return !(L == R);
}
};
std::error_code writeImportLibrary(StringRef ImportName,
StringRef Path,
ArrayRef<COFFShortExport> Exports,
- COFF::MachineTypes Machine);
+ COFF::MachineTypes Machine,
+ bool MakeWeakAliases);
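// Illustrative sketch, not part of the original header: a hypothetical caller
// emitting an import library for one exported symbol with the post-patch
// five-argument signature:
//   COFFShortExport E;
//   E.Name = "foo";
//   std::error_code EC =
//       writeImportLibrary("mylib.dll", "mylib.lib", {E},
//                          COFF::IMAGE_FILE_MACHINE_AMD64,
//                          /*MakeWeakAliases=*/true);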
} // namespace object
} // namespace llvm
#endif
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index b973203a89b6..9539fd7c7559 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -1,11393 +1,11423 @@
//===- ScalarEvolution.cpp - Scalar Evolution Analysis --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the scalar evolution analysis
// engine, which is used primarily to analyze expressions involving induction
// variables in loops.
//
// There are several aspects to this library. First is the representation of
// scalar expressions, which are represented as subclasses of the SCEV class.
// These classes are used to represent certain types of subexpressions that we
// can handle. We only create one SCEV of a particular shape, so
// pointer-comparisons for equality are legal.
//
// One important aspect of the SCEV objects is that they are never cyclic, even
// if there is a cycle in the dataflow for an expression (ie, a PHI node). If
// the PHI node is one of the idioms that we can represent (e.g., a polynomial
// recurrence) then we represent it directly as a recurrence node, otherwise we
// represent it as a SCEVUnknown node.
//
// In addition to being able to represent expressions of various types, we also
// have folders that are used to build the *canonical* representation for a
// particular expression. These folders are capable of using a variety of
// rewrite rules to simplify the expressions.
//
// Once the folders are defined, we can implement the more interesting
// higher-level code, such as the code that recognizes PHI nodes of various
// types, computes the execution count of a loop, etc.
//
// TODO: We should use these routines and value representations to implement
// dependence analysis!
//
//===----------------------------------------------------------------------===//
//
// There are several good references for the techniques used in this analysis.
//
// Chains of recurrences -- a method to expedite the evaluation
// of closed-form functions
// Olaf Bachmann, Paul S. Wang, Eugene V. Zima
//
// On computational properties of chains of recurrences
// Eugene V. Zima
//
// Symbolic Evaluation of Chains of Recurrences for Loop Optimization
// Robert A. van Engelen
//
// Efficient Symbolic Analysis for Optimizing Compilers
// Robert A. van Engelen
//
// Using the chains of recurrences algebra for data dependence testing and
// induction variable substitution
// MS Thesis, Johnie Birch
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "scalar-evolution"
STATISTIC(NumArrayLenItCounts,
"Number of trip counts computed with array length");
STATISTIC(NumTripCountsComputed,
"Number of loops with predictable loop counts");
STATISTIC(NumTripCountsNotComputed,
"Number of loops without predictable loop counts");
STATISTIC(NumBruteForceTripCountsComputed,
"Number of loops with trip counts computed by force");
static cl::opt<unsigned>
MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
cl::desc("Maximum number of iterations SCEV will "
"symbolically execute a constant "
"derived loop"),
cl::init(100));
// FIXME: Enable this with EXPENSIVE_CHECKS when the test suite is clean.
static cl::opt<bool>
VerifySCEV("verify-scev",
cl::desc("Verify ScalarEvolution's backedge taken counts (slow)"));
static cl::opt<bool>
VerifySCEVMap("verify-scev-maps",
cl::desc("Verify no dangling value in ScalarEvolution's "
"ExprValueMap (slow)"));
static cl::opt<unsigned> MulOpsInlineThreshold(
"scev-mulops-inline-threshold", cl::Hidden,
cl::desc("Threshold for inlining multiplication operands into a SCEV"),
cl::init(32));
static cl::opt<unsigned> AddOpsInlineThreshold(
"scev-addops-inline-threshold", cl::Hidden,
cl::desc("Threshold for inlining addition operands into a SCEV"),
cl::init(500));
static cl::opt<unsigned> MaxSCEVCompareDepth(
"scalar-evolution-max-scev-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive SCEV complexity comparisons"),
cl::init(32));
static cl::opt<unsigned> MaxSCEVOperationsImplicationDepth(
"scalar-evolution-max-scev-operations-implication-depth", cl::Hidden,
cl::desc("Maximum depth of recursive SCEV operations implication analysis"),
cl::init(2));
static cl::opt<unsigned> MaxValueCompareDepth(
"scalar-evolution-max-value-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive value complexity comparisons"),
cl::init(2));
static cl::opt<unsigned>
MaxArithDepth("scalar-evolution-max-arith-depth", cl::Hidden,
cl::desc("Maximum depth of recursive arithmetics"),
cl::init(32));
static cl::opt<unsigned> MaxConstantEvolvingDepth(
"scalar-evolution-max-constant-evolving-depth", cl::Hidden,
cl::desc("Maximum depth of recursive constant evolving"), cl::init(32));
static cl::opt<unsigned>
MaxExtDepth("scalar-evolution-max-ext-depth", cl::Hidden,
cl::desc("Maximum depth of recursive SExt/ZExt"),
cl::init(8));
+static cl::opt<unsigned>
+ MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden,
+ cl::desc("Max coefficients in AddRec during evolving"),
+ cl::init(16));
+
//===----------------------------------------------------------------------===//
// SCEV class definitions
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Implementation of the SCEV class.
//
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
}
#endif
void SCEV::print(raw_ostream &OS) const {
switch (static_cast<SCEVTypes>(getSCEVType())) {
case scConstant:
cast<SCEVConstant>(this)->getValue()->printAsOperand(OS, false);
return;
case scTruncate: {
const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(this);
const SCEV *Op = Trunc->getOperand();
OS << "(trunc " << *Op->getType() << " " << *Op << " to "
<< *Trunc->getType() << ")";
return;
}
case scZeroExtend: {
const SCEVZeroExtendExpr *ZExt = cast<SCEVZeroExtendExpr>(this);
const SCEV *Op = ZExt->getOperand();
OS << "(zext " << *Op->getType() << " " << *Op << " to "
<< *ZExt->getType() << ")";
return;
}
case scSignExtend: {
const SCEVSignExtendExpr *SExt = cast<SCEVSignExtendExpr>(this);
const SCEV *Op = SExt->getOperand();
OS << "(sext " << *Op->getType() << " " << *Op << " to "
<< *SExt->getType() << ")";
return;
}
case scAddRecExpr: {
const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(this);
OS << "{" << *AR->getOperand(0);
for (unsigned i = 1, e = AR->getNumOperands(); i != e; ++i)
OS << ",+," << *AR->getOperand(i);
OS << "}<";
if (AR->hasNoUnsignedWrap())
OS << "nuw><";
if (AR->hasNoSignedWrap())
OS << "nsw><";
if (AR->hasNoSelfWrap() &&
!AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW)))
OS << "nw><";
AR->getLoop()->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ">";
return;
}
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr: {
const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this);
const char *OpStr = nullptr;
switch (NAry->getSCEVType()) {
case scAddExpr: OpStr = " + "; break;
case scMulExpr: OpStr = " * "; break;
case scUMaxExpr: OpStr = " umax "; break;
case scSMaxExpr: OpStr = " smax "; break;
}
OS << "(";
for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
I != E; ++I) {
OS << **I;
if (std::next(I) != E)
OS << OpStr;
}
OS << ")";
switch (NAry->getSCEVType()) {
case scAddExpr:
case scMulExpr:
if (NAry->hasNoUnsignedWrap())
OS << "<nuw>";
if (NAry->hasNoSignedWrap())
OS << "<nsw>";
}
return;
}
case scUDivExpr: {
const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(this);
OS << "(" << *UDiv->getLHS() << " /u " << *UDiv->getRHS() << ")";
return;
}
case scUnknown: {
const SCEVUnknown *U = cast<SCEVUnknown>(this);
Type *AllocTy;
if (U->isSizeOf(AllocTy)) {
OS << "sizeof(" << *AllocTy << ")";
return;
}
if (U->isAlignOf(AllocTy)) {
OS << "alignof(" << *AllocTy << ")";
return;
}
Type *CTy;
Constant *FieldNo;
if (U->isOffsetOf(CTy, FieldNo)) {
OS << "offsetof(" << *CTy << ", ";
FieldNo->printAsOperand(OS, false);
OS << ")";
return;
}
// Otherwise just print it normally.
U->getValue()->printAsOperand(OS, false);
return;
}
case scCouldNotCompute:
OS << "***COULDNOTCOMPUTE***";
return;
}
llvm_unreachable("Unknown SCEV kind!");
}
Type *SCEV::getType() const {
switch (static_cast<SCEVTypes>(getSCEVType())) {
case scConstant:
return cast<SCEVConstant>(this)->getType();
case scTruncate:
case scZeroExtend:
case scSignExtend:
return cast<SCEVCastExpr>(this)->getType();
case scAddRecExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr:
return cast<SCEVNAryExpr>(this)->getType();
case scAddExpr:
return cast<SCEVAddExpr>(this)->getType();
case scUDivExpr:
return cast<SCEVUDivExpr>(this)->getType();
case scUnknown:
return cast<SCEVUnknown>(this)->getType();
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
}
llvm_unreachable("Unknown SCEV kind!");
}
bool SCEV::isZero() const {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
return SC->getValue()->isZero();
return false;
}
bool SCEV::isOne() const {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
return SC->getValue()->isOne();
return false;
}
bool SCEV::isAllOnesValue() const {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
return SC->getValue()->isMinusOne();
return false;
}
bool SCEV::isNonConstantNegative() const {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(this);
if (!Mul) return false;
// If there is a constant factor, it will be first.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
if (!SC) return false;
// Return true if the value is negative, this matches things like (-42 * V).
return SC->getAPInt().isNegative();
}
SCEVCouldNotCompute::SCEVCouldNotCompute() :
SCEV(FoldingSetNodeIDRef(), scCouldNotCompute) {}
bool SCEVCouldNotCompute::classof(const SCEV *S) {
return S->getSCEVType() == scCouldNotCompute;
}
const SCEV *ScalarEvolution::getConstant(ConstantInt *V) {
FoldingSetNodeID ID;
ID.AddInteger(scConstant);
ID.AddPointer(V);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
SCEV *S = new (SCEVAllocator) SCEVConstant(ID.Intern(SCEVAllocator), V);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
const SCEV *ScalarEvolution::getConstant(const APInt &Val) {
return getConstant(ConstantInt::get(getContext(), Val));
}
const SCEV *
ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) {
IntegerType *ITy = cast<IntegerType>(getEffectiveSCEVType(Ty));
return getConstant(ConstantInt::get(ITy, V, isSigned));
}
SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID,
unsigned SCEVTy, const SCEV *op, Type *ty)
: SCEV(ID, SCEVTy), Op(op), Ty(ty) {}
SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scTruncate, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate non-integer value!");
}
SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scZeroExtend, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot zero extend non-integer value!");
}
SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID,
const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scSignExtend, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot sign extend non-integer value!");
}
void SCEVUnknown::deleted() {
// Clear this SCEVUnknown from various maps.
SE->forgetMemoizedResults(this);
// Remove this SCEVUnknown from the uniquing map.
SE->UniqueSCEVs.RemoveNode(this);
// Release the value.
setValPtr(nullptr);
}
void SCEVUnknown::allUsesReplacedWith(Value *New) {
// Clear this SCEVUnknown from various maps.
SE->forgetMemoizedResults(this);
// Remove this SCEVUnknown from the uniquing map.
SE->UniqueSCEVs.RemoveNode(this);
// Update this SCEVUnknown to point to the new value. This is needed
// because there may still be outstanding SCEVs which still point to
// this SCEVUnknown.
setValPtr(New);
}
bool SCEVUnknown::isSizeOf(Type *&AllocTy) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
if (CE->getOpcode() == Instruction::GetElementPtr &&
CE->getOperand(0)->isNullValue() &&
CE->getNumOperands() == 2)
if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(1)))
if (CI->isOne()) {
AllocTy = cast<PointerType>(CE->getOperand(0)->getType())
->getElementType();
return true;
}
return false;
}
bool SCEVUnknown::isAlignOf(Type *&AllocTy) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
if (CE->getOpcode() == Instruction::GetElementPtr &&
CE->getOperand(0)->isNullValue()) {
Type *Ty =
cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
if (StructType *STy = dyn_cast<StructType>(Ty))
if (!STy->isPacked() &&
CE->getNumOperands() == 3 &&
CE->getOperand(1)->isNullValue()) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(2)))
if (CI->isOne() &&
STy->getNumElements() == 2 &&
STy->getElementType(0)->isIntegerTy(1)) {
AllocTy = STy->getElementType(1);
return true;
}
}
}
return false;
}
bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
if (CE->getOpcode() == Instruction::GetElementPtr &&
CE->getNumOperands() == 3 &&
CE->getOperand(0)->isNullValue() &&
CE->getOperand(1)->isNullValue()) {
Type *Ty =
cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
// Ignore vector types here so that ScalarEvolutionExpander doesn't
// emit getelementptrs that index into vectors.
if (Ty->isStructTy() || Ty->isArrayTy()) {
CTy = Ty;
FieldNo = CE->getOperand(2);
return true;
}
}
return false;
}
//===----------------------------------------------------------------------===//
// SCEV Utilities
//===----------------------------------------------------------------------===//
/// Compare the two values \p LV and \p RV in terms of their "complexity" where
/// "complexity" is a partial (and somewhat ad-hoc) relation used to order
/// operands in SCEV expressions. \p EqCache is a set of pairs of values that
/// have been previously deemed to be "equally complex" by this routine. It is
/// intended to avoid exponential time complexity in cases like:
///
/// %a = f(%x, %y)
/// %b = f(%a, %a)
/// %c = f(%b, %b)
///
/// %d = f(%x, %y)
/// %e = f(%d, %d)
/// %f = f(%e, %e)
///
/// CompareValueComplexity(%f, %c)
///
/// Since we do not continue running this routine on expression trees once we
/// have seen unequal values, there is no need to track them in the cache.
static int
CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
const LoopInfo *const LI, Value *LV, Value *RV,
unsigned Depth) {
if (Depth > MaxValueCompareDepth || EqCache.count({LV, RV}))
return 0;
// Order pointer values after integer values. This helps SCEVExpander form
// GEPs.
bool LIsPointer = LV->getType()->isPointerTy(),
RIsPointer = RV->getType()->isPointerTy();
if (LIsPointer != RIsPointer)
return (int)LIsPointer - (int)RIsPointer;
// Compare getValueID values.
unsigned LID = LV->getValueID(), RID = RV->getValueID();
if (LID != RID)
return (int)LID - (int)RID;
// Sort arguments by their position.
if (const auto *LA = dyn_cast<Argument>(LV)) {
const auto *RA = cast<Argument>(RV);
unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo();
return (int)LArgNo - (int)RArgNo;
}
if (const auto *LGV = dyn_cast<GlobalValue>(LV)) {
const auto *RGV = cast<GlobalValue>(RV);
const auto IsGVNameSemantic = [&](const GlobalValue *GV) {
auto LT = GV->getLinkage();
return !(GlobalValue::isPrivateLinkage(LT) ||
GlobalValue::isInternalLinkage(LT));
};
// Use the names to distinguish the two values, but only if the
// names are semantically important.
if (IsGVNameSemantic(LGV) && IsGVNameSemantic(RGV))
return LGV->getName().compare(RGV->getName());
}
// For instructions, compare their loop depth, and their operand count. This
// is pretty loose.
if (const auto *LInst = dyn_cast<Instruction>(LV)) {
const auto *RInst = cast<Instruction>(RV);
// Compare loop depths.
const BasicBlock *LParent = LInst->getParent(),
*RParent = RInst->getParent();
if (LParent != RParent) {
unsigned LDepth = LI->getLoopDepth(LParent),
RDepth = LI->getLoopDepth(RParent);
if (LDepth != RDepth)
return (int)LDepth - (int)RDepth;
}
// Compare the number of operands.
unsigned LNumOps = LInst->getNumOperands(),
RNumOps = RInst->getNumOperands();
if (LNumOps != RNumOps)
return (int)LNumOps - (int)RNumOps;
for (unsigned Idx : seq(0u, LNumOps)) {
int Result =
CompareValueComplexity(EqCache, LI, LInst->getOperand(Idx),
RInst->getOperand(Idx), Depth + 1);
if (Result != 0)
return Result;
}
}
EqCache.insert({LV, RV});
return 0;
}
// Return negative, zero, or positive if LHS is less than, equal to, or greater
// than RHS, respectively. A three-way result allows recursive comparisons to be
// more efficient.
static int CompareSCEVComplexity(
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> &EqCacheSCEV,
const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS,
DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
if (LHS == RHS)
return 0;
// Primarily, sort the SCEVs by their getSCEVType().
unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType();
if (LType != RType)
return (int)LType - (int)RType;
if (Depth > MaxSCEVCompareDepth || EqCacheSCEV.count({LHS, RHS}))
return 0;
// Aside from the getSCEVType() ordering, the particular ordering
// isn't very important except that it's beneficial to be consistent,
// so that (a + b) and (b + a) don't end up as different expressions.
switch (static_cast<SCEVTypes>(LType)) {
case scUnknown: {
const SCEVUnknown *LU = cast<SCEVUnknown>(LHS);
const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
SmallSet<std::pair<Value *, Value *>, 8> EqCache;
int X = CompareValueComplexity(EqCache, LI, LU->getValue(), RU->getValue(),
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
}
case scConstant: {
const SCEVConstant *LC = cast<SCEVConstant>(LHS);
const SCEVConstant *RC = cast<SCEVConstant>(RHS);
// Compare constant values.
const APInt &LA = LC->getAPInt();
const APInt &RA = RC->getAPInt();
unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth();
if (LBitWidth != RBitWidth)
return (int)LBitWidth - (int)RBitWidth;
return LA.ult(RA) ? -1 : 1;
}
case scAddRecExpr: {
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
// There is always a dominance relationship between two recs that are used by
// one SCEV, so we can safely sort recs by loop header dominance. We require
// such an order in getAddExpr.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
assert(LHead != RHead && "Two loops share the same header?");
if (DT.dominates(LHead, RHead))
return 1;
else
assert(DT.dominates(RHead, LHead) &&
"No dominance between recurrences used by one SCEV?");
return -1;
}
// Addrec complexity grows with operand count.
unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands();
if (LNumOps != RNumOps)
return (int)LNumOps - (int)RNumOps;
// Lexicographically compare.
for (unsigned i = 0; i != LNumOps; ++i) {
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i),
RA->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
EqCacheSCEV.insert({LHS, RHS});
return 0;
}
case scAddExpr:
case scMulExpr:
case scSMaxExpr:
case scUMaxExpr: {
const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS);
const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS);
// Lexicographically compare n-ary expressions.
unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands();
if (LNumOps != RNumOps)
return (int)LNumOps - (int)RNumOps;
for (unsigned i = 0; i != LNumOps; ++i) {
if (i >= RNumOps)
return 1;
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i),
RC->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
EqCacheSCEV.insert({LHS, RHS});
return 0;
}
case scUDivExpr: {
const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS);
const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS);
// Lexicographically compare udiv expressions.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(),
DT, Depth + 1);
if (X != 0)
return X;
X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT,
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
}
case scTruncate:
case scZeroExtend:
case scSignExtend: {
const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS);
const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS);
// Compare cast expressions by operand.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(),
RC->getOperand(), DT, Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
}
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
}
llvm_unreachable("Unknown SCEV kind!");
}
/// Given a list of SCEV objects, order them by their complexity, and group
/// objects of the same complexity together by value. When this routine is
/// finished, we know that any duplicates in the vector are consecutive and that
/// complexity is monotonically increasing.
///
/// Note that we take special precautions to ensure that we get deterministic
/// results from this routine. In other words, we don't want the results of
/// this to depend on where the addresses of various SCEV objects happened to
/// land in memory.
///
static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
LoopInfo *LI, DominatorTree &DT) {
if (Ops.size() < 2) return; // Noop
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> EqCache;
if (Ops.size() == 2) {
// This is the common case, which also happens to be trivially simple.
// Special case it.
const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0)
std::swap(LHS, RHS);
return;
}
// Do the rough sort by complexity.
std::stable_sort(Ops.begin(), Ops.end(),
[&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) {
return
CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0;
});
// Now that we are sorted by complexity, group elements of the same
// complexity. Note that this is, at worst, N^2, but the vector is likely to
// be extremely short in practice. Note that we take this approach because we
// do not want to depend on the addresses of the objects we are grouping.
for (unsigned i = 0, e = Ops.size(); i != e-2; ++i) {
const SCEV *S = Ops[i];
unsigned Complexity = S->getSCEVType();
// If there are any objects of the same complexity and same value as this
// one, group them.
for (unsigned j = i+1; j != e && Ops[j]->getSCEVType() == Complexity; ++j) {
if (Ops[j] == S) { // Found a duplicate.
// Move it to immediately after i'th element.
std::swap(Ops[i+1], Ops[j]);
++i; // no need to rescan it.
if (i == e-2) return; // Done!
}
}
}
}
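// For example (illustrative only): if Ops arrives as
//   { %x, 7, (%a * %b), %x }
// then after GroupByComplexity the constant sorts to the front (scConstant is
// the lowest SCEVType), the duplicate %x entries become adjacent, and the
// result is roughly
//   { 7, (%a * %b), %x, %x }
// which lets callers such as getAddExpr fold duplicates in a single scan.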
// Returns the size of SCEV S, i.e. the number of nodes in its expression tree.
static inline int sizeOfSCEV(const SCEV *S) {
struct FindSCEVSize {
int Size;
FindSCEVSize() : Size(0) {}
bool follow(const SCEV *S) {
++Size;
// Keep looking at all operands of S.
return true;
}
bool isDone() const {
return false;
}
};
FindSCEVSize F;
SCEVTraversal<FindSCEVSize> ST(F);
ST.visitAll(S);
return F.Size;
}
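// Worked example (illustrative only): for S = (%a + (%b * %c)) the traversal
// visits the add, %a, the mul, %b and %c, so sizeOfSCEV(S) == 5.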
namespace {
struct SCEVDivision : public SCEVVisitor<SCEVDivision, void> {
public:
// Computes the Quotient and Remainder of the division of Numerator by
// Denominator.
static void divide(ScalarEvolution &SE, const SCEV *Numerator,
const SCEV *Denominator, const SCEV **Quotient,
const SCEV **Remainder) {
assert(Numerator && Denominator && "Uninitialized SCEV");
SCEVDivision D(SE, Numerator, Denominator);
// Check for the trivial case here to avoid having to check for it in the
// rest of the code.
if (Numerator == Denominator) {
*Quotient = D.One;
*Remainder = D.Zero;
return;
}
if (Numerator->isZero()) {
*Quotient = D.Zero;
*Remainder = D.Zero;
return;
}
// A simple case: N/1. The quotient is N.
if (Denominator->isOne()) {
*Quotient = Numerator;
*Remainder = D.Zero;
return;
}
// Split the Denominator when it is a product.
if (const SCEVMulExpr *T = dyn_cast<SCEVMulExpr>(Denominator)) {
const SCEV *Q, *R;
*Quotient = Numerator;
for (const SCEV *Op : T->operands()) {
divide(SE, *Quotient, Op, &Q, &R);
*Quotient = Q;
// Bail out when the Numerator is not divisible by one of the terms of
// the Denominator.
if (!R->isZero()) {
*Quotient = D.Zero;
*Remainder = Numerator;
return;
}
}
*Remainder = D.Zero;
return;
}
D.visit(Numerator);
*Quotient = D.Quotient;
*Remainder = D.Remainder;
}
// Except in the trivial case described above, we do not know how to divide
// Expr by Denominator, so the following visitors have empty implementations.
void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {}
void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {}
void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {}
void visitUDivExpr(const SCEVUDivExpr *Numerator) {}
void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {}
void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {}
void visitUnknown(const SCEVUnknown *Numerator) {}
void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
void visitConstant(const SCEVConstant *Numerator) {
if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
APInt NumeratorVal = Numerator->getAPInt();
APInt DenominatorVal = D->getAPInt();
uint32_t NumeratorBW = NumeratorVal.getBitWidth();
uint32_t DenominatorBW = DenominatorVal.getBitWidth();
if (NumeratorBW > DenominatorBW)
DenominatorVal = DenominatorVal.sext(NumeratorBW);
else if (NumeratorBW < DenominatorBW)
NumeratorVal = NumeratorVal.sext(DenominatorBW);
APInt QuotientVal(NumeratorVal.getBitWidth(), 0);
APInt RemainderVal(NumeratorVal.getBitWidth(), 0);
APInt::sdivrem(NumeratorVal, DenominatorVal, QuotientVal, RemainderVal);
Quotient = SE.getConstant(QuotientVal);
Remainder = SE.getConstant(RemainderVal);
return;
}
}
void visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
const SCEV *StartQ, *StartR, *StepQ, *StepR;
if (!Numerator->isAffine())
return cannotDivide(Numerator);
divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR);
divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR);
// Bail out if the types do not match.
Type *Ty = Denominator->getType();
if (Ty != StartQ->getType() || Ty != StartR->getType() ||
Ty != StepQ->getType() || Ty != StepR->getType())
return cannotDivide(Numerator);
Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(),
Numerator->getNoWrapFlags());
Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(),
Numerator->getNoWrapFlags());
}
void visitAddExpr(const SCEVAddExpr *Numerator) {
SmallVector<const SCEV *, 2> Qs, Rs;
Type *Ty = Denominator->getType();
for (const SCEV *Op : Numerator->operands()) {
const SCEV *Q, *R;
divide(SE, Op, Denominator, &Q, &R);
// Bail out if types do not match.
if (Ty != Q->getType() || Ty != R->getType())
return cannotDivide(Numerator);
Qs.push_back(Q);
Rs.push_back(R);
}
if (Qs.size() == 1) {
Quotient = Qs[0];
Remainder = Rs[0];
return;
}
Quotient = SE.getAddExpr(Qs);
Remainder = SE.getAddExpr(Rs);
}
void visitMulExpr(const SCEVMulExpr *Numerator) {
SmallVector<const SCEV *, 2> Qs;
Type *Ty = Denominator->getType();
bool FoundDenominatorTerm = false;
for (const SCEV *Op : Numerator->operands()) {
// Bail out if types do not match.
if (Ty != Op->getType())
return cannotDivide(Numerator);
if (FoundDenominatorTerm) {
Qs.push_back(Op);
continue;
}
// Check whether Denominator divides one of the product operands.
const SCEV *Q, *R;
divide(SE, Op, Denominator, &Q, &R);
if (!R->isZero()) {
Qs.push_back(Op);
continue;
}
// Bail out if types do not match.
if (Ty != Q->getType())
return cannotDivide(Numerator);
FoundDenominatorTerm = true;
Qs.push_back(Q);
}
if (FoundDenominatorTerm) {
Remainder = Zero;
if (Qs.size() == 1)
Quotient = Qs[0];
else
Quotient = SE.getMulExpr(Qs);
return;
}
if (!isa<SCEVUnknown>(Denominator))
return cannotDivide(Numerator);
// The Remainder is obtained by replacing Denominator by 0 in Numerator.
ValueToValueMap RewriteMap;
RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
cast<SCEVConstant>(Zero)->getValue();
Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
if (Remainder->isZero()) {
// The Quotient is obtained by replacing Denominator by 1 in Numerator.
RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
cast<SCEVConstant>(One)->getValue();
Quotient =
SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
return;
}
// Quotient is (Numerator - Remainder) divided by Denominator.
const SCEV *Q, *R;
const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
// This SCEV does not seem to simplify: fail the division here.
if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator))
return cannotDivide(Numerator);
divide(SE, Diff, Denominator, &Q, &R);
if (R != Zero)
return cannotDivide(Numerator);
Quotient = Q;
}
private:
SCEVDivision(ScalarEvolution &S, const SCEV *Numerator,
const SCEV *Denominator)
: SE(S), Denominator(Denominator) {
Zero = SE.getZero(Denominator->getType());
One = SE.getOne(Denominator->getType());
// We generally do not know how to divide Expr by Denominator. We
// initialize the division to a "cannot divide" state to simplify the rest
// of the code.
cannotDivide(Numerator);
}
// Convenience function for giving up on the division. We set the quotient to
// be equal to zero and the remainder to be equal to the numerator.
void cannotDivide(const SCEV *Numerator) {
Quotient = Zero;
Remainder = Numerator;
}
ScalarEvolution &SE;
const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One;
};
}
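// A minimal usage sketch for the divider above (illustrative only), assuming a
// ScalarEvolution instance SE, Num == (6 + 4 * %n) and Den == 2:
//
//   const SCEV *Q, *R;
//   SCEVDivision::divide(SE, Num, Den, &Q, &R);
//   // Q is (3 + 2 * %n) and R is 0, modulo SCEV's canonical operand order.
//
// When the division cannot be carried out (e.g. a SCEVUnknown numerator with a
// non-trivial denominator), cannotDivide leaves Q == 0 and R == Num.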
//===----------------------------------------------------------------------===//
// Simple SCEV method implementations
//===----------------------------------------------------------------------===//
/// Compute BC(It, K). The result has width W. Assume K > 0.
static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
ScalarEvolution &SE,
Type *ResultTy) {
// Handle the simplest case efficiently.
if (K == 1)
return SE.getTruncateOrZeroExtend(It, ResultTy);
// We are using the following formula for BC(It, K):
//
// BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K!
//
// Suppose W is the bitwidth of the return value. We must be prepared for
// overflow. Hence, we must ensure that the result of our computation is
// equal to the exact result modulo 2^W. Unfortunately, division isn't
// safe in modular arithmetic.
//
// However, this code doesn't use exactly that formula; the formula it uses
// is something like the following, where T is the number of factors of 2 in
// K! (i.e. trailing zeros in the binary representation of K!), and ^ is
// exponentiation:
//
// BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T)
//
// This formula is trivially equivalent to the previous formula. However,
// this formula can be implemented much more efficiently. The trick is that
// K! / 2^T is odd, and exact division by an odd number *is* safe in modular
// arithmetic. To do exact division in modular arithmetic, all we have
// to do is multiply by the inverse. Therefore, this step can be done at
// width W.
//
// The next issue is how to safely do the division by 2^T. The way this
// is done is by doing the multiplication step at a width of at least W + T
// bits. This way, the bottom W+T bits of the product are accurate. Then,
// when we perform the division by 2^T (which is equivalent to a right shift
// by T), the bottom W bits are accurate. Extra bits are okay; they'll get
// truncated out after the division by 2^T.
//
// In comparison to just directly using the first formula, this technique
// is much more efficient; using the first formula requires W * K bits,
// but this formula needs less than W + K bits. Also, the first formula requires
// a division step, whereas this formula only requires multiplies and shifts.
//
// It doesn't matter whether the subtraction step is done in the calculation
// width or the input iteration count's width; if the subtraction overflows,
// the result must be zero anyway. We prefer here to do it in the width of
// the induction variable because it helps a lot for certain cases; CodeGen
// isn't smart enough to ignore the overflow, which leads to much less
// efficient code if the width of the subtraction is wider than the native
// register width.
//
// (It's possible to not widen at all by pulling out factors of 2 before
// the multiplication; for example, K=2 can be calculated as
// It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires
// extra arithmetic, so it's not an obvious win, and it gets
// much more complicated for K > 3.)
// Protection from insane SCEVs; this bound is conservative,
// but it probably doesn't matter.
if (K > 1000)
return SE.getCouldNotCompute();
unsigned W = SE.getTypeSizeInBits(ResultTy);
// Calculate K! / 2^T and T; we divide out the factors of two before
// multiplying for calculating K! / 2^T to avoid overflow.
// Other overflow doesn't matter because we only care about the bottom
// W bits of the result.
APInt OddFactorial(W, 1);
unsigned T = 1;
for (unsigned i = 3; i <= K; ++i) {
APInt Mult(W, i);
unsigned TwoFactors = Mult.countTrailingZeros();
T += TwoFactors;
Mult.lshrInPlace(TwoFactors);
OddFactorial *= Mult;
}
// We need at least W + T bits for the multiplication step
unsigned CalculationBits = W + T;
// Calculate 2^T, at width T+W.
APInt DivFactor = APInt::getOneBitSet(CalculationBits, T);
// Calculate the multiplicative inverse of K! / 2^T;
// this multiplication factor will perform the exact division by
// K! / 2^T.
APInt Mod = APInt::getSignedMinValue(W+1);
APInt MultiplyFactor = OddFactorial.zext(W+1);
MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
MultiplyFactor = MultiplyFactor.trunc(W);
// Calculate the product, at width T+W
IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
CalculationBits);
const SCEV *Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
for (unsigned i = 1; i != K; ++i) {
const SCEV *S = SE.getMinusSCEV(It, SE.getConstant(It->getType(), i));
Dividend = SE.getMulExpr(Dividend,
SE.getTruncateOrZeroExtend(S, CalculationTy));
}
// Divide by 2^T
const SCEV *DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
// Truncate the result, and divide by K! / 2^T.
return SE.getMulExpr(SE.getConstant(MultiplyFactor),
SE.getTruncateOrZeroExtend(DivResult, ResultTy));
}
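// Worked example (illustrative only): for K = 4, K! = 24 = 2^3 * 3, so the
// loop above ends with T == 3 and OddFactorial == 3. The product
// It*(It-1)*(It-2)*(It-3) is formed at width W + 3, divided by 2^3 (the udiv),
// truncated back to W bits, and multiplied by the multiplicative inverse of 3
// modulo 2^W, which performs the exact division by the odd part of K!.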
/// Return the value of this chain of recurrences at the specified iteration
/// number. We can evaluate this recurrence by multiplying each element in the
/// chain by the binomial coefficient corresponding to it. In other words, we
/// can evaluate {A,+,B,+,C,+,D} as:
///
/// A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3)
///
/// where BC(It, k) stands for binomial coefficient.
///
const SCEV *SCEVAddRecExpr::evaluateAtIteration(const SCEV *It,
ScalarEvolution &SE) const {
const SCEV *Result = getStart();
for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
// The computation is correct in the face of overflow provided that the
// multiplication is performed _after_ the evaluation of the binomial
// coefficient.
const SCEV *Coeff = BinomialCoefficient(It, i, SE, getType());
if (isa<SCEVCouldNotCompute>(Coeff))
return Coeff;
Result = SE.getAddExpr(Result, SE.getMulExpr(getOperand(i), Coeff));
}
return Result;
}
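// For example (illustrative only): evaluating {3,+,5} at iteration It gives
// 3 + 5*BC(It,1) == 3 + 5*It, and {0,+,1,+,1} gives
// BC(It,1) + BC(It,2) == It + It*(It-1)/2.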
//===----------------------------------------------------------------------===//
// SCEV Expression folder implementations
//===----------------------------------------------------------------------===//
const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
"This is not a truncating conversion!");
assert(isSCEVable(Ty) &&
"This is not a conversion to a SCEVable type!");
Ty = getEffectiveSCEVType(Ty);
FoldingSetNodeID ID;
ID.AddInteger(scTruncate);
ID.AddPointer(Op);
ID.AddPointer(Ty);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty)));
// trunc(trunc(x)) --> trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op))
return getTruncateExpr(ST->getOperand(), Ty);
// trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing
if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
return getTruncateOrSignExtend(SS->getOperand(), Ty);
// trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
return getTruncateOrZeroExtend(SZ->getOperand(), Ty);
// trunc(x1+x2+...+xN) --> trunc(x1)+trunc(x2)+...+trunc(xN) if we can
// eliminate all the truncates, or we replace other casts with truncates.
if (const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Op)) {
SmallVector<const SCEV *, 4> Operands;
bool hasTrunc = false;
for (unsigned i = 0, e = SA->getNumOperands(); i != e && !hasTrunc; ++i) {
const SCEV *S = getTruncateExpr(SA->getOperand(i), Ty);
if (!isa<SCEVCastExpr>(SA->getOperand(i)))
hasTrunc = isa<SCEVTruncateExpr>(S);
Operands.push_back(S);
}
if (!hasTrunc)
return getAddExpr(Operands);
UniqueSCEVs.FindNodeOrInsertPos(ID, IP); // Mutates IP, returns NULL.
}
// trunc(x1*x2*...*xN) --> trunc(x1)*trunc(x2)*...*trunc(xN) if we can
// eliminate all the truncates, or we replace other casts with truncates.
if (const SCEVMulExpr *SM = dyn_cast<SCEVMulExpr>(Op)) {
SmallVector<const SCEV *, 4> Operands;
bool hasTrunc = false;
for (unsigned i = 0, e = SM->getNumOperands(); i != e && !hasTrunc; ++i) {
const SCEV *S = getTruncateExpr(SM->getOperand(i), Ty);
if (!isa<SCEVCastExpr>(SM->getOperand(i)))
hasTrunc = isa<SCEVTruncateExpr>(S);
Operands.push_back(S);
}
if (!hasTrunc)
return getMulExpr(Operands);
UniqueSCEVs.FindNodeOrInsertPos(ID, IP); // Mutates IP, returns NULL.
}
// If the input value is a chrec scev, truncate the chrec's operands.
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *Op : AddRec->operands())
Operands.push_back(getTruncateExpr(Op, Ty));
return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap);
}
// The cast wasn't folded; create an explicit cast node. We can reuse
// the existing insert position since if we get here, we won't have
// made any changes which would invalidate it.
SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
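// For example (illustrative only): truncating
//   (zext i8 %a to i32) + (zext i8 %b to i32)
// from i32 to i16 distributes over the add and re-folds each cast, yielding
//   (zext i8 %a to i16) + (zext i8 %b to i16)
// with no SCEVTruncateExpr left behind.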
// Get the limit of a recurrence such that incrementing by Step cannot cause
// signed overflow as long as the value of the recurrence within the
// loop does not exceed this limit before incrementing.
static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step,
ICmpInst::Predicate *Pred,
ScalarEvolution *SE) {
unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
if (SE->isKnownPositive(Step)) {
*Pred = ICmpInst::ICMP_SLT;
return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
SE->getSignedRangeMax(Step));
}
if (SE->isKnownNegative(Step)) {
*Pred = ICmpInst::ICMP_SGT;
return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
SE->getSignedRangeMin(Step));
}
return nullptr;
}
// Get the limit of a recurrence such that incrementing by Step cannot cause
// unsigned overflow as long as the value of the recurrence within the loop does
// not exceed this limit before incrementing.
static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step,
ICmpInst::Predicate *Pred,
ScalarEvolution *SE) {
unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
*Pred = ICmpInst::ICMP_ULT;
return SE->getConstant(APInt::getMinValue(BitWidth) -
SE->getUnsignedRangeMax(Step));
}
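// A small worked example for the two helpers above (illustrative only): for an
// i8 recurrence whose step is known to lie in [1, 3], the signed limit is
// (i8)(SINT_MIN - 3) == 125 with Pred == ICMP_SLT (any value < 125 can absorb
// a step of at most 3 without signed wrap), and the unsigned limit is
// (i8)(0 - 3) == 253 with Pred == ICMP_ULT.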
namespace {
struct ExtendOpTraitsBase {
typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *,
unsigned);
};
// Used to make code generic over signed and unsigned overflow.
template <typename ExtendOp> struct ExtendOpTraits {
// Members present:
//
// static const SCEV::NoWrapFlags WrapType;
//
// static const ExtendOpTraitsBase::GetExtendExprTy GetExtendExpr;
//
// static const SCEV *getOverflowLimitForStep(const SCEV *Step,
// ICmpInst::Predicate *Pred,
// ScalarEvolution *SE);
};
template <>
struct ExtendOpTraits<SCEVSignExtendExpr> : public ExtendOpTraitsBase {
static const SCEV::NoWrapFlags WrapType = SCEV::FlagNSW;
static const GetExtendExprTy GetExtendExpr;
static const SCEV *getOverflowLimitForStep(const SCEV *Step,
ICmpInst::Predicate *Pred,
ScalarEvolution *SE) {
return getSignedOverflowLimitForStep(Step, Pred, SE);
}
};
const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits<
SCEVSignExtendExpr>::GetExtendExpr = &ScalarEvolution::getSignExtendExpr;
template <>
struct ExtendOpTraits<SCEVZeroExtendExpr> : public ExtendOpTraitsBase {
static const SCEV::NoWrapFlags WrapType = SCEV::FlagNUW;
static const GetExtendExprTy GetExtendExpr;
static const SCEV *getOverflowLimitForStep(const SCEV *Step,
ICmpInst::Predicate *Pred,
ScalarEvolution *SE) {
return getUnsignedOverflowLimitForStep(Step, Pred, SE);
}
};
const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits<
SCEVZeroExtendExpr>::GetExtendExpr = &ScalarEvolution::getZeroExtendExpr;
}
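// A minimal sketch of how the traits are consumed (illustrative only): the
// code below is written once against ExtendOpTraits<ExtendOpTy>, so an
// instantiation such as
//
//   getPreStartForExtend<SCEVZeroExtendExpr>(AR, Ty, SE, Depth)
//
// picks up WrapType == SCEV::FlagNUW, GetExtendExpr ==
// &ScalarEvolution::getZeroExtendExpr and the unsigned overflow-limit helper,
// while the SCEVSignExtendExpr instantiation uses the NSW/sign-extend
// counterparts.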
// The recurrence AR has been shown to have no signed/unsigned wrap or something
// close to it. Typically, if we can prove NSW/NUW for AR, then we can just as
// easily prove NSW/NUW for its preincrement or postincrement sibling. This
// allows normalizing a sign/zero extended AddRec as such:
// {sext/zext(Step + Start),+,Step} => {Step + sext/zext(Start),+,Step}.
// As a result, the expression "Step + sext/zext(PreIncAR)" is congruent with
// "sext/zext(PostIncAR)".
template <typename ExtendOpTy>
static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty,
ScalarEvolution *SE, unsigned Depth) {
auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType;
auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
const Loop *L = AR->getLoop();
const SCEV *Start = AR->getStart();
const SCEV *Step = AR->getStepRecurrence(*SE);
// Check for a simple looking step prior to loop entry.
const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
if (!SA)
return nullptr;
// Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
// subtraction is expensive. For this purpose, perform a quick and dirty
// difference, by checking for Step in the operand list.
SmallVector<const SCEV *, 4> DiffOps;
for (const SCEV *Op : SA->operands())
if (Op != Step)
DiffOps.push_back(Op);
if (DiffOps.size() == SA->getNumOperands())
return nullptr;
// Try to prove `WrapType` (SCEV::FlagNSW or SCEV::FlagNUW) on `PreStart` +
// `Step`:
// 1. NSW/NUW flags on the step increment.
auto PreStartFlags =
ScalarEvolution::maskFlags(SA->getNoWrapFlags(), SCEV::FlagNUW);
const SCEV *PreStart = SE->getAddExpr(DiffOps, PreStartFlags);
const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
// "{S,+,X} is <nsw>/<nuw>" and "the backedge is taken at least once" implies
// "S+X does not sign/unsign-overflow".
//
const SCEV *BECount = SE->getBackedgeTakenCount(L);
if (PreAR && PreAR->getNoWrapFlags(WrapType) &&
!isa<SCEVCouldNotCompute>(BECount) && SE->isKnownPositive(BECount))
return PreStart;
// 2. Direct overflow check on the step operation's expression.
unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
const SCEV *OperandExtendedStart =
SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Depth),
(SE->*GetExtendExpr)(Step, WideTy, Depth));
if ((SE->*GetExtendExpr)(Start, WideTy, Depth) == OperandExtendedStart) {
if (PreAR && AR->getNoWrapFlags(WrapType)) {
// If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW
// or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then
// `PreAR` == {`PreStart`,+,`Step`} is also `WrapType`. Cache this fact.
const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(WrapType);
}
return PreStart;
}
// 3. Loop precondition.
ICmpInst::Predicate Pred;
const SCEV *OverflowLimit =
ExtendOpTraits<ExtendOpTy>::getOverflowLimitForStep(Step, &Pred, SE);
if (OverflowLimit &&
SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit))
return PreStart;
return nullptr;
}
// Get the normalized zero or sign extended expression for this AddRec's Start.
template <typename ExtendOpTy>
static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty,
ScalarEvolution *SE,
unsigned Depth) {
auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Depth);
if (!PreStart)
return (SE->*GetExtendExpr)(AR->getStart(), Ty, Depth);
return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty,
Depth),
(SE->*GetExtendExpr)(PreStart, Ty, Depth));
}
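// Worked example (illustrative only): for AR = {(2 + %step),+,%step}, the
// quick-and-dirty difference in getPreStartForExtend yields PreStart == 2.
// If {2,+,%step} is already known not to wrap (in the WrapType sense) and the
// backedge is taken at least once, getExtendAddRecStart returns
//   ext(%step) + ext(2)
// instead of ext(2 + %step), matching the normalization described before
// getPreStartForExtend.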
// Try to prove away overflow by looking at "nearby" add recurrences. A
// motivating example for this rule: if we know `{0,+,4}` is `ult` `-1` and it
// does not itself wrap then we can conclude that `{1,+,4}` is `nuw`.
//
// Formally:
//
// {S,+,X} == {S-T,+,X} + T
// => Ext({S,+,X}) == Ext({S-T,+,X} + T)
//
// If ({S-T,+,X} + T) does not overflow ... (1)
//
// RHS == Ext({S-T,+,X} + T) == Ext({S-T,+,X}) + Ext(T)
//
// If {S-T,+,X} does not overflow ... (2)
//
// RHS == Ext({S-T,+,X}) + Ext(T) == {Ext(S-T),+,Ext(X)} + Ext(T)
// == {Ext(S-T)+Ext(T),+,Ext(X)}
//
// If (S-T)+T does not overflow ... (3)
//
// RHS == {Ext(S-T)+Ext(T),+,Ext(X)} == {Ext(S-T+T),+,Ext(X)}
// == {Ext(S),+,Ext(X)} == LHS
//
// Thus, if (1), (2) and (3) are true for some T, then
// Ext({S,+,X}) == {Ext(S),+,Ext(X)}
//
// (3) is implied by (1) -- "(S-T)+T does not overflow" is simply "({S-T,+,X}+T)
// does not overflow" restricted to the 0th iteration. Therefore we only need
// to check for (1) and (2).
//
// In the current context, S is `Start`, X is `Step`, Ext is `ExtendOpTy` and T
// is `Delta` (defined below).
//
template <typename ExtendOpTy>
bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start,
const SCEV *Step,
const Loop *L) {
auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType;
// We restrict `Start` to a constant to prevent SCEV from spending too much
// time here. It is correct (but more expensive) to continue with a
// non-constant `Start` and do a general SCEV subtraction to compute
// `PreStart` below.
//
const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start);
if (!StartC)
return false;
APInt StartAI = StartC->getAPInt();
for (unsigned Delta : {-2, -1, 1, 2}) {
const SCEV *PreStart = getConstant(StartAI - Delta);
FoldingSetNodeID ID;
ID.AddInteger(scAddRecExpr);
ID.AddPointer(PreStart);
ID.AddPointer(Step);
ID.AddPointer(L);
void *IP = nullptr;
const auto *PreAR =
static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
// Give up if we don't already have the add recurrence we need because
// actually constructing an add recurrence is relatively expensive.
if (PreAR && PreAR->getNoWrapFlags(WrapType)) { // proves (2)
const SCEV *DeltaS = getConstant(StartC->getType(), Delta);
ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
const SCEV *Limit = ExtendOpTraits<ExtendOpTy>::getOverflowLimitForStep(
DeltaS, &Pred, this);
if (Limit && isKnownPredicate(Pred, PreAR, Limit)) // proves (1)
return true;
}
}
return false;
}
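// Tying this back to the motivating example above (illustrative only): for
// Start == 1 and Step == 4, the Delta == 1 iteration looks up the
// already-built {0,+,4}. If that recurrence carries the required no-wrap flag
// (proving (2)) and isKnownPredicate shows it stays below the overflow limit
// for a step of 1 (proving (1)), the caller may mark {1,+,4} as non-wrapping.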
const SCEV *
ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
"This is not a conversion to a SCEVable type!");
Ty = getEffectiveSCEVType(Ty);
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty)));
// zext(zext(x)) --> zext(x)
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1);
// Before doing any expensive analysis, check to see if we've already
// computed a SCEV for this Op and Ty.
FoldingSetNodeID ID;
ID.AddInteger(scZeroExtend);
ID.AddPointer(Op);
ID.AddPointer(Ty);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
if (Depth > MaxExtDepth) {
SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
// zext(trunc(x)) --> zext(x) or x or trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
// It's possible the bits taken off by the truncate were all zero bits. If
// so, we should be able to simplify this further.
const SCEV *X = ST->getOperand();
ConstantRange CR = getUnsignedRange(X);
unsigned TruncBits = getTypeSizeInBits(ST->getType());
unsigned NewBits = getTypeSizeInBits(Ty);
if (CR.truncate(TruncBits).zeroExtend(NewBits).contains(
CR.zextOrTrunc(NewBits)))
return getTruncateOrZeroExtend(X, Ty);
}
// If the input value is a chrec scev, and we can prove that the value
// did not overflow the old, smaller, value, we can zero extend all of the
// operands (often constants). This allows analysis of something like
// this: for (unsigned char X = 0; X < 100; ++X) { int Y = X; }
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
if (AR->isAffine()) {
const SCEV *Start = AR->getStart();
const SCEV *Step = AR->getStepRecurrence(*this);
unsigned BitWidth = getTypeSizeInBits(AR->getType());
const Loop *L = AR->getLoop();
if (!AR->hasNoUnsignedWrap()) {
auto NewFlags = proveNoWrapViaConstantRanges(AR);
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(NewFlags);
}
// If we have special knowledge that this addrec won't overflow,
// we don't need to do any further analysis.
if (AR->hasNoUnsignedWrap())
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1),
getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
// Check whether the backedge-taken count is SCEVCouldNotCompute.
// Note that this serves two purposes: It filters out loops that are
// simply not analyzable, and it covers the case where this code is
// being called from within backedge-taken count analysis, such that
// attempting to ask for the backedge-taken count would likely result
// in infinite recursion. In the latter case, the analysis code will
// cope with a conservative value, and it will take care to purge
// that value once it has finished.
const SCEV *MaxBECount = getMaxBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
// Manually compute the final value for AR, checking for
// overflow.
// Check whether the backedge-taken count can be losslessly cast to
// the addrec's type. The count is always unsigned.
const SCEV *CastedMaxBECount =
getTruncateOrZeroExtend(MaxBECount, Start->getType());
const SCEV *RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no unsigned overflow.
const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step,
SCEV::FlagAnyWrap, Depth + 1);
const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul,
SCEV::FlagAnyWrap,
Depth + 1),
WideTy, Depth + 1);
const SCEV *WideStart = getZeroExtendExpr(Start, WideTy, Depth + 1);
const SCEV *WideMaxBECount =
getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1);
const SCEV *OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
getZeroExtendExpr(Step, WideTy, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NUW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
Depth + 1),
getZeroExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
// Similar to above, only this time treat the step value as signed.
// This covers loops that count down.
OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
getSignExtendExpr(Step, WideTy, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NW, which is propagated to this AddRec.
// Negative step causes unsigned wrap, but it still can't self-wrap.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
}
}
// Normally, in the cases we can prove no-overflow via a
// backedge guarding condition, we can also compute a backedge
// taken count for the loop. The exceptions are assumptions and
// guards present in the loop -- SCEV is not great at exploiting
// these to compute max backedge taken counts, but can still use
// these to prove lack of overflow. Use this fact to avoid
// doing extra work that may not pay off.
if (!isa<SCEVCouldNotCompute>(MaxBECount) || HasGuards ||
!AC.assumptions().empty()) {
// If the backedge is guarded by a comparison with the pre-inc
// value the addrec is safe. Also, if the entry is guarded by
// a comparison with the start value and the backedge is
// guarded by a comparison with the post-inc value, the addrec
// is safe.
if (isKnownPositive(Step)) {
const SCEV *N = getConstant(APInt::getMinValue(BitWidth) -
getUnsignedRangeMax(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) ||
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
AR->getPostIncExpr(*this), N))) {
// Cache knowledge of AR NUW, which is propagated to this
// AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
Depth + 1),
getZeroExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
} else if (isKnownNegative(Step)) {
const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) -
getSignedRangeMin(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) ||
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT,
AR->getPostIncExpr(*this), N))) {
// Cache knowledge of AR NW, which is propagated to this
// AddRec. Negative step causes unsigned wrap, but it
// still can't self-wrap.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
}
}
if (proveNoWrapByVaryingStart<SCEVZeroExtendExpr>(Start, Step, L)) {
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
return getAddRecExpr(
getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1),
getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
// zext((A + B + ...)<nuw>) --> (zext(A) + zext(B) + ...)<nuw>
if (SA->hasNoUnsignedWrap()) {
// If the addition does not unsign overflow then we can, by definition,
// commute the zero extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
for (const auto *Op : SA->operands())
Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
}
}
// The cast wasn't folded; create an explicit cast node.
// Recompute the insert position, as it may have been invalidated.
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
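// Worked example (illustrative only): for the loop in the comment above, the
// i8 addrec is {0,+,1} with MaxBECount == 99. The manual overflow check
// confirms Start + Step*MaxBECount fits in i8, so AR is tagged <nuw> and
// zext({0,+,1}<i8>) to i32 folds to the i32 addrec {0,+,1}<nuw>, with no
// SCEVZeroExtendExpr wrapping the recurrence.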
const SCEV *
ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
"This is not a conversion to a SCEVable type!");
Ty = getEffectiveSCEVType(Ty);
// Fold if the operand is constant.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
return getConstant(
cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty)));
// sext(sext(x)) --> sext(x)
if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
return getSignExtendExpr(SS->getOperand(), Ty, Depth + 1);
// sext(zext(x)) --> zext(x)
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1);
// Before doing any expensive analysis, check to see if we've already
// computed a SCEV for this Op and Ty.
FoldingSetNodeID ID;
ID.AddInteger(scSignExtend);
ID.AddPointer(Op);
ID.AddPointer(Ty);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
// Limit recursion depth.
if (Depth > MaxExtDepth) {
SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
// sext(trunc(x)) --> sext(x) or x or trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
// It's possible the bits taken off by the truncate were all sign bits. If
// so, we should be able to simplify this further.
const SCEV *X = ST->getOperand();
ConstantRange CR = getSignedRange(X);
unsigned TruncBits = getTypeSizeInBits(ST->getType());
unsigned NewBits = getTypeSizeInBits(Ty);
if (CR.truncate(TruncBits).signExtend(NewBits).contains(
CR.sextOrTrunc(NewBits)))
return getTruncateOrSignExtend(X, Ty);
}
// sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2
if (auto *SA = dyn_cast<SCEVAddExpr>(Op)) {
if (SA->getNumOperands() == 2) {
auto *SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0));
auto *SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1));
if (SMul && SC1) {
if (auto *SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) {
const APInt &C1 = SC1->getAPInt();
const APInt &C2 = SC2->getAPInt();
if (C1.isStrictlyPositive() && C2.isStrictlyPositive() &&
C2.ugt(C1) && C2.isPowerOf2())
return getAddExpr(getSignExtendExpr(SC1, Ty, Depth + 1),
getSignExtendExpr(SMul, Ty, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
}
}
}
// sext((A + B + ...)<nsw>) --> (sext(A) + sext(B) + ...)<nsw>
if (SA->hasNoSignedWrap()) {
// If the addition does not sign overflow then we can, by definition,
// commute the sign extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
for (const auto *Op : SA->operands())
Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1));
return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1);
}
}
// If the input value is a chrec scev, and we can prove that the value
// did not overflow the old, smaller, value, we can sign extend all of the
// operands (often constants). This allows analysis of something like
// this: for (signed char X = 0; X < 100; ++X) { int Y = X; }
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
if (AR->isAffine()) {
const SCEV *Start = AR->getStart();
const SCEV *Step = AR->getStepRecurrence(*this);
unsigned BitWidth = getTypeSizeInBits(AR->getType());
const Loop *L = AR->getLoop();
if (!AR->hasNoSignedWrap()) {
auto NewFlags = proveNoWrapViaConstantRanges(AR);
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(NewFlags);
}
// If we have special knowledge that this addrec won't overflow,
// we don't need to do any further analysis.
if (AR->hasNoSignedWrap())
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L, SCEV::FlagNSW);
// Check whether the backedge-taken count is SCEVCouldNotCompute.
// Note that this serves two purposes: It filters out loops that are
// simply not analyzable, and it covers the case where this code is
// being called from within backedge-taken count analysis, such that
// attempting to ask for the backedge-taken count would likely result
// in infinite recursion. In the latter case, the analysis code will
// cope with a conservative value, and it will take care to purge
// that value once it has finished.
const SCEV *MaxBECount = getMaxBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
// Manually compute the final value for AR, checking for
// overflow.
// Check whether the backedge-taken count can be losslessly cast to
// the addrec's type. The count is always unsigned.
const SCEV *CastedMaxBECount =
getTruncateOrZeroExtend(MaxBECount, Start->getType());
const SCEV *RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no signed overflow.
const SCEV *SMul = getMulExpr(CastedMaxBECount, Step,
SCEV::FlagAnyWrap, Depth + 1);
const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul,
SCEV::FlagAnyWrap,
Depth + 1),
WideTy, Depth + 1);
const SCEV *WideStart = getSignExtendExpr(Start, WideTy, Depth + 1);
const SCEV *WideMaxBECount =
getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1);
const SCEV *OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
getSignExtendExpr(Step, WideTy, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
if (SAdd == OperandExtendedAdd) {
// Cache knowledge of AR NSW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this,
Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
// Similar to above, only this time treat the step value as unsigned.
// This covers loops that count up with an unsigned step.
OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
getZeroExtendExpr(Step, WideTy, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
if (SAdd == OperandExtendedAdd) {
// If AR wraps around then
//
// abs(Step) * MaxBECount > unsigned-max(AR->getType())
// => SAdd != OperandExtendedAdd
//
// Thus (AR is not NW => SAdd != OperandExtendedAdd) <=>
// (SAdd == OperandExtendedAdd => AR is NW)
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this,
Depth + 1),
getZeroExtendExpr(Step, Ty, Depth + 1), L,
AR->getNoWrapFlags());
}
}
}
// Normally, in the cases we can prove no-overflow via a
// backedge guarding condition, we can also compute a backedge
// taken count for the loop. The exceptions are assumptions and
// guards present in the loop -- SCEV is not great at exploiting
// these to compute max backedge taken counts, but can still use
// these to prove lack of overflow. Use this fact to avoid
// doing extra work that may not pay off.
if (!isa<SCEVCouldNotCompute>(MaxBECount) || HasGuards ||
!AC.assumptions().empty()) {
// If the backedge is guarded by a comparison with the pre-inc
// value the addrec is safe. Also, if the entry is guarded by
// a comparison with the start value and the backedge is
// guarded by a comparison with the post-inc value, the addrec
// is safe.
ICmpInst::Predicate Pred;
const SCEV *OverflowLimit =
getSignedOverflowLimitForStep(Step, &Pred, this);
if (OverflowLimit &&
(isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
(isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this),
OverflowLimit)))) {
// Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
// If Start and Step are constants, check if we can apply this
// transformation:
// sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2
auto *SC1 = dyn_cast<SCEVConstant>(Start);
auto *SC2 = dyn_cast<SCEVConstant>(Step);
if (SC1 && SC2) {
const APInt &C1 = SC1->getAPInt();
const APInt &C2 = SC2->getAPInt();
if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) &&
C2.isPowerOf2()) {
Start = getSignExtendExpr(Start, Ty, Depth + 1);
const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L,
AR->getNoWrapFlags());
return getAddExpr(Start, getSignExtendExpr(NewAR, Ty, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
}
}
if (proveNoWrapByVaryingStart<SCEVSignExtendExpr>(Start, Step, L)) {
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
return getAddRecExpr(
getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
// If the input value is provably non-negative and we could not simplify
// away the sext, build a zext instead.
if (isKnownNonNegative(Op))
return getZeroExtendExpr(Op, Ty, Depth + 1);
// The cast wasn't folded; create an explicit cast node.
// Recompute the insert position, as it may have been invalidated.
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
Op, Ty);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
/// getAnyExtendExpr - Return a SCEV for the given operand extended with
/// unspecified bits out to the given type.
///
const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
"This is not a conversion to a SCEVable type!");
Ty = getEffectiveSCEVType(Ty);
// Sign-extend negative constants.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
if (SC->getAPInt().isNegative())
return getSignExtendExpr(Op, Ty);
// Peel off a truncate cast.
if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Op)) {
const SCEV *NewOp = T->getOperand();
if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty))
return getAnyExtendExpr(NewOp, Ty);
return getTruncateOrNoop(NewOp, Ty);
}
// Next try a zext cast. If the cast is folded, use it.
const SCEV *ZExt = getZeroExtendExpr(Op, Ty);
if (!isa<SCEVZeroExtendExpr>(ZExt))
return ZExt;
// Next try a sext cast. If the cast is folded, use it.
const SCEV *SExt = getSignExtendExpr(Op, Ty);
if (!isa<SCEVSignExtendExpr>(SExt))
return SExt;
// Force the cast to be folded into the operands of an addrec.
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op)) {
SmallVector<const SCEV *, 4> Ops;
for (const SCEV *Op : AR->operands())
Ops.push_back(getAnyExtendExpr(Op, Ty));
return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW);
}
// If the expression is obviously signed, use the sext cast value.
if (isa<SCEVSMaxExpr>(Op))
return SExt;
// Absent any other information, use the zext cast value.
return ZExt;
}
/// Process the given Ops list, which is a list of operands to be added under
/// the given scale, and update the given map. This is a helper function for
/// getAddRecExpr. As an example of what it does, given a sequence of operands
/// that would form an add expression like this:
///
/// m + n + 13 + (A * (o + p + (B * (q + m + 29)))) + r + (-1 * r)
///
/// where A and B are constants, update the map with these values:
///
/// (m, 1+A*B), (n, 1), (o, A), (p, A), (q, A*B), (r, 0)
///
/// and add 13 + A*B*29 to AccumulatedConstant.
/// This will allow getAddRecExpr to produce this:
///
/// 13+A*B*29 + n + (m * (1+A*B)) + ((o + p) * A) + (q * A*B)
///
/// This form often exposes folding opportunities that are hidden in
/// the original operand list.
///
/// Return true iff it appears that any interesting folding opportunities
/// may be exposed. This helps getAddRecExpr short-circuit extra work in
/// the common case where no interesting opportunities are present, and
/// is also used as a check to avoid infinite recursion.
///
static bool
CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M,
SmallVectorImpl<const SCEV *> &NewOps,
APInt &AccumulatedConstant,
const SCEV *const *Ops, size_t NumOperands,
const APInt &Scale,
ScalarEvolution &SE) {
bool Interesting = false;
// Iterate over the add operands. They are sorted, with constants first.
unsigned i = 0;
while (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) {
++i;
// Pull a buried constant out to the outside.
if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero())
Interesting = true;
AccumulatedConstant += Scale * C->getAPInt();
}
// Next comes everything else. We're especially interested in multiplies
// here, but they're in the middle, so just visit the rest with one loop.
for (; i != NumOperands; ++i) {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[i]);
if (Mul && isa<SCEVConstant>(Mul->getOperand(0))) {
APInt NewScale =
Scale * cast<SCEVConstant>(Mul->getOperand(0))->getAPInt();
if (Mul->getNumOperands() == 2 && isa<SCEVAddExpr>(Mul->getOperand(1))) {
// A multiplication of a constant with another add; recurse.
const SCEVAddExpr *Add = cast<SCEVAddExpr>(Mul->getOperand(1));
Interesting |=
CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant,
Add->op_begin(), Add->getNumOperands(),
NewScale, SE);
} else {
// A multiplication of a constant with some other value. Update
// the map.
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin()+1, Mul->op_end());
const SCEV *Key = SE.getMulExpr(MulOps);
auto Pair = M.insert({Key, NewScale});
if (Pair.second) {
NewOps.push_back(Pair.first->first);
} else {
Pair.first->second += NewScale;
// The map already had an entry for this value, which may indicate
// a folding opportunity.
Interesting = true;
}
}
} else {
// An ordinary operand. Update the map.
std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair =
M.insert({Ops[i], Scale});
if (Pair.second) {
NewOps.push_back(Pair.first->first);
} else {
Pair.first->second += Scale;
// The map already had an entry for this value, which may indicate
// a folding opportunity.
Interesting = true;
}
}
}
return Interesting;
}
// We're trying to construct a SCEV of type `Type' with `Ops' as operands and
// `OldFlags' as can't-wrap behavior. Infer a more aggressive set of
// can't-overflow flags for the operation if possible.
static SCEV::NoWrapFlags
StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
const SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags) {
using namespace std::placeholders;
typedef OverflowingBinaryOperator OBO;
bool CanAnalyze =
Type == scAddExpr || Type == scAddRecExpr || Type == scMulExpr;
(void)CanAnalyze;
assert(CanAnalyze && "don't call from other places!");
int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
SCEV::NoWrapFlags SignOrUnsignWrap =
ScalarEvolution::maskFlags(Flags, SignOrUnsignMask);
// If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
auto IsKnownNonNegative = [&](const SCEV *S) {
return SE->isKnownNonNegative(S);
};
if (SignOrUnsignWrap == SCEV::FlagNSW && all_of(Ops, IsKnownNonNegative))
Flags =
ScalarEvolution::setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
SignOrUnsignWrap = ScalarEvolution::maskFlags(Flags, SignOrUnsignMask);
if (SignOrUnsignWrap != SignOrUnsignMask && Type == scAddExpr &&
Ops.size() == 2 && isa<SCEVConstant>(Ops[0])) {
// (A + C) --> (A + C)<nsw> if the addition does not sign-overflow
// (A + C) --> (A + C)<nuw> if the addition does not unsigned-overflow
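// As a rough sketch with i8 operands: for C = 100, the region of values x
// for which "x + 100" cannot sign-overflow is [-128, 27]; if Ops[1] has a
// known signed range of, say, [-10, 20], that range is contained in the
// region and FlagNSW can safely be added below.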
const APInt &C = cast<SCEVConstant>(Ops[0])->getAPInt();
if (!(SignOrUnsignWrap & SCEV::FlagNSW)) {
auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
Instruction::Add, C, OBO::NoSignedWrap);
if (NSWRegion.contains(SE->getSignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW);
}
if (!(SignOrUnsignWrap & SCEV::FlagNUW)) {
auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
Instruction::Add, C, OBO::NoUnsignedWrap);
if (NUWRegion.contains(SE->getUnsignedRange(Ops[1])))
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
}
}
return Flags;
}
bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) {
if (!isLoopInvariant(S, L))
return false;
// If a value depends on a SCEVUnknown which is defined after the loop, we
// conservatively assume that we cannot calculate it at the loop's entry.
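// For example, an instruction defined in a block that executes only after
// the loop is loop-invariant (it is not inside the loop), but its block is
// typically dominated by the loop header, so the walk below flags its
// SCEVUnknown and the expression is treated as unavailable at loop entry.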
struct FindDominatedSCEVUnknown {
bool Found = false;
const Loop *L;
DominatorTree &DT;
LoopInfo &LI;
FindDominatedSCEVUnknown(const Loop *L, DominatorTree &DT, LoopInfo &LI)
: L(L), DT(DT), LI(LI) {}
bool checkSCEVUnknown(const SCEVUnknown *SU) {
if (auto *I = dyn_cast<Instruction>(SU->getValue())) {
if (DT.dominates(L->getHeader(), I->getParent()))
Found = true;
else
assert(DT.dominates(I->getParent(), L->getHeader()) &&
"No dominance relationship between SCEV and loop?");
}
return false;
}
bool follow(const SCEV *S) {
switch (static_cast<SCEVTypes>(S->getSCEVType())) {
case scConstant:
return false;
case scAddRecExpr:
case scTruncate:
case scZeroExtend:
case scSignExtend:
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr:
case scUDivExpr:
return true;
case scUnknown:
return checkSCEVUnknown(cast<SCEVUnknown>(S));
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
}
return false;
}
bool isDone() { return Found; }
};
FindDominatedSCEVUnknown FSU(L, DT, LI);
SCEVTraversal<FindDominatedSCEVUnknown> ST(FSU);
ST.visitAll(S);
return !FSU.Found;
}
/// Get a canonical add expression, or something simpler if possible.
const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags,
unsigned Depth) {
assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
"only nuw or nsw allowed");
assert(!Ops.empty() && "Cannot get empty add!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVAddExpr operand types don't match!");
#endif
// Sort by complexity; this groups all similar expression types together.
GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
++Idx;
assert(Idx < Ops.size());
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
// We found two constants, fold them together!
Ops[0] = getConstant(LHSC->getAPInt() + RHSC->getAPInt());
if (Ops.size() == 2) return Ops[0];
Ops.erase(Ops.begin()+1); // Erase the folded element
LHSC = cast<SCEVConstant>(Ops[0]);
}
// If we are left with a constant zero being added, strip it off.
if (LHSC->getValue()->isZero()) {
Ops.erase(Ops.begin());
--Idx;
}
if (Ops.size() == 1) return Ops[0];
}
// Limit the recursion depth.
if (Depth > MaxArithDepth)
return getOrCreateAddExpr(Ops, Flags);
// Okay, check to see if the same value occurs in the operand list more than
// once. If so, merge them together into a multiply expression. Since we
// sorted the list, these values are required to be adjacent.
Type *Ty = Ops[0]->getType();
bool FoundMatch = false;
for (unsigned i = 0, e = Ops.size(); i != e-1; ++i)
if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2
// Scan ahead to count how many equal operands there are.
unsigned Count = 2;
while (i+Count != e && Ops[i+Count] == Ops[i])
++Count;
// Merge the values into a multiply.
const SCEV *Scale = getConstant(Ty, Count);
const SCEV *Mul = getMulExpr(Scale, Ops[i], SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == Count)
return Mul;
Ops[i] = Mul;
Ops.erase(Ops.begin()+i+1, Ops.begin()+i+Count);
--i; e -= Count - 1;
FoundMatch = true;
}
if (FoundMatch)
return getAddExpr(Ops, Flags);
// Check for truncates. If all the operands are truncated from the same
// type, see if factoring out the truncate would permit the result to be
// folded. e.g., trunc(x) + m*trunc(n) --> trunc(x + trunc(m)*n)
// if the contents of the resulting outer trunc fold to something simple.
for (; Idx < Ops.size() && isa<SCEVTruncateExpr>(Ops[Idx]); ++Idx) {
const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Ops[Idx]);
Type *DstType = Trunc->getType();
Type *SrcType = Trunc->getOperand()->getType();
SmallVector<const SCEV *, 8> LargeOps;
bool Ok = true;
// Check all the operands to see if they can be represented in the
// source type of the truncate.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Ops[i])) {
if (T->getOperand()->getType() != SrcType) {
Ok = false;
break;
}
LargeOps.push_back(T->getOperand());
} else if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) {
LargeOps.push_back(getAnyExtendExpr(C, SrcType));
} else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Ops[i])) {
SmallVector<const SCEV *, 8> LargeMulOps;
for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) {
if (const SCEVTruncateExpr *T =
dyn_cast<SCEVTruncateExpr>(M->getOperand(j))) {
if (T->getOperand()->getType() != SrcType) {
Ok = false;
break;
}
LargeMulOps.push_back(T->getOperand());
} else if (const auto *C = dyn_cast<SCEVConstant>(M->getOperand(j))) {
LargeMulOps.push_back(getAnyExtendExpr(C, SrcType));
} else {
Ok = false;
break;
}
}
if (Ok)
LargeOps.push_back(getMulExpr(LargeMulOps, SCEV::FlagAnyWrap, Depth + 1));
} else {
Ok = false;
break;
}
}
if (Ok) {
// Evaluate the expression in the larger type.
const SCEV *Fold = getAddExpr(LargeOps, Flags, Depth + 1);
// If it folds to something simple, use it. Otherwise, don't.
if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
return getTruncateExpr(Fold, DstType);
}
}
// Skip past any other cast SCEVs.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
++Idx;
// If there are add operands, they would be next.
if (Idx < Ops.size()) {
bool DeletedAdd = false;
while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) {
if (Ops.size() > AddOpsInlineThreshold ||
Add->getNumOperands() > AddOpsInlineThreshold)
break;
// If we have an add, expand the add operands onto the end of the operands
// list.
Ops.erase(Ops.begin()+Idx);
Ops.append(Add->op_begin(), Add->op_end());
DeletedAdd = true;
}
// If we deleted at least one add, we added operands to the end of the list,
// and they are not necessarily sorted. Recurse to resort and resimplify
// any operands we just acquired.
if (DeletedAdd)
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Skip over the add expression until we get to a multiply.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
++Idx;
// Check to see if there are any folding opportunities present with
// operands multiplied by constant values.
if (Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx])) {
uint64_t BitWidth = getTypeSizeInBits(Ty);
DenseMap<const SCEV *, APInt> M;
SmallVector<const SCEV *, 8> NewOps;
APInt AccumulatedConstant(BitWidth, 0);
if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant,
Ops.data(), Ops.size(),
APInt(BitWidth, 1), *this)) {
struct APIntCompare {
bool operator()(const APInt &LHS, const APInt &RHS) const {
return LHS.ult(RHS);
}
};
// Some interesting folding opportunity is present, so it's worthwhile to
// re-generate the operands list. Group the operands by constant scale,
// to avoid multiplying by the same constant scale multiple times.
std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists;
for (const SCEV *NewOp : NewOps)
MulOpLists[M.find(NewOp)->second].push_back(NewOp);
// Re-generate the operands list.
Ops.clear();
if (AccumulatedConstant != 0)
Ops.push_back(getConstant(AccumulatedConstant));
for (auto &MulOp : MulOpLists)
if (MulOp.first != 0)
Ops.push_back(getMulExpr(
getConstant(MulOp.first),
getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1));
if (Ops.empty())
return getZero(Ty);
if (Ops.size() == 1)
return Ops[0];
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
// If we are adding something to a multiply expression, make sure the
// something is not already an operand of the multiply. If so, merge it into
// the multiply.
for (; Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx]); ++Idx) {
const SCEVMulExpr *Mul = cast<SCEVMulExpr>(Ops[Idx]);
for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) {
const SCEV *MulOpSCEV = Mul->getOperand(MulOp);
if (isa<SCEVConstant>(MulOpSCEV))
continue;
for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp)
if (MulOpSCEV == Ops[AddOp]) {
// Fold W + X + (X * Y * Z) --> W + (X * ((Y*Z)+1))
const SCEV *InnerMul = Mul->getOperand(MulOp == 0);
if (Mul->getNumOperands() != 2) {
// If the multiply has more than two operands, we must get the
// Y*Z term.
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
Mul->op_begin()+MulOp);
MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
InnerMul = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
SmallVector<const SCEV *, 2> TwoOps = {getOne(Ty), InnerMul};
const SCEV *AddOne = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV,
SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == 2) return OuterMul;
if (AddOp < Idx) {
Ops.erase(Ops.begin()+AddOp);
Ops.erase(Ops.begin()+Idx-1);
} else {
Ops.erase(Ops.begin()+Idx);
Ops.erase(Ops.begin()+AddOp-1);
}
Ops.push_back(OuterMul);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Check this multiply against other multiplies being added together.
for (unsigned OtherMulIdx = Idx+1;
OtherMulIdx < Ops.size() && isa<SCEVMulExpr>(Ops[OtherMulIdx]);
++OtherMulIdx) {
const SCEVMulExpr *OtherMul = cast<SCEVMulExpr>(Ops[OtherMulIdx]);
// If MulOp occurs in OtherMul, we can fold the two multiplies
// together.
for (unsigned OMulOp = 0, e = OtherMul->getNumOperands();
OMulOp != e; ++OMulOp)
if (OtherMul->getOperand(OMulOp) == MulOpSCEV) {
// Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E))
const SCEV *InnerMul1 = Mul->getOperand(MulOp == 0);
if (Mul->getNumOperands() != 2) {
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
Mul->op_begin()+MulOp);
MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
InnerMul1 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0);
if (OtherMul->getNumOperands() != 2) {
SmallVector<const SCEV *, 4> MulOps(OtherMul->op_begin(),
OtherMul->op_begin()+OMulOp);
MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end());
InnerMul2 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
SmallVector<const SCEV *, 2> TwoOps = {InnerMul1, InnerMul2};
const SCEV *InnerMulSum =
getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum,
SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == 2) return OuterMul;
Ops.erase(Ops.begin()+Idx);
Ops.erase(Ops.begin()+OtherMulIdx-1);
Ops.push_back(OuterMul);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
}
}
// If there are any add recurrences in the operands list, see if any other
// added values are loop invariant. If so, we can fold them into the
// recurrence.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
++Idx;
// Scan over all recurrences, trying to fold loop invariants into them.
for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
// Scan all of the other operands to this add and add them to the vector if
// they are loop invariant w.r.t. the recurrence.
SmallVector<const SCEV *, 8> LIOps;
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
const Loop *AddRecLoop = AddRec->getLoop();
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
LIOps.push_back(Ops[i]);
Ops.erase(Ops.begin()+i);
--i; --e;
}
// If we found some loop invariants, fold them into the recurrence.
if (!LIOps.empty()) {
// NLI + LI + {Start,+,Step} --> NLI + {LI+Start,+,Step}
LIOps.push_back(AddRec->getStart());
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
// This follows from the fact that the no-wrap flags on the outer add
// expression are applicable on the 0th iteration, when the add recurrence
// will be equal to its start value.
AddRecOps[0] = getAddExpr(LIOps, Flags, Depth + 1);
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer add and the inner addrec are guaranteed to have no overflow.
// Always propagate NW.
Flags = AddRec->getNoWrapFlags(setFlags(Flags, SCEV::FlagNW));
const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop, Flags);
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
// Otherwise, add the folded AddRec by the non-invariant parts.
for (unsigned i = 0;; ++i)
if (Ops[i] == AddRec) {
Ops[i] = NewRec;
break;
}
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Okay, if there weren't any loop invariants to be folded, check to see if
// there are multiple AddRecs with the same loop induction variable being
// added together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
++OtherIdx) {
// We expect the AddRecExprs to be sorted in reverse dominance order,
// so that the 1st found AddRecExpr is dominated by all others.
assert(DT.dominates(
cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()->getHeader(),
AddRec->getLoop()->getHeader()) &&
"AddRecExprs are not sorted in reverse dominance order?");
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
// Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L>
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
++OtherIdx) {
const auto *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
if (OtherAddRec->getLoop() == AddRecLoop) {
for (unsigned i = 0, e = OtherAddRec->getNumOperands();
i != e; ++i) {
if (i >= AddRecOps.size()) {
AddRecOps.append(OtherAddRec->op_begin()+i,
OtherAddRec->op_end());
break;
}
SmallVector<const SCEV *, 2> TwoOps = {
AddRecOps[i], OtherAddRec->getOperand(i)};
AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
}
Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
}
}
// Step size has changed, so we cannot guarantee no self-wraparound.
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
}
// Okay, it looks like we really DO need an add expr. Check to see if we
// already have one, otherwise create a new one.
return getOrCreateAddExpr(Ops, Flags);
}
const SCEV *
ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags) {
FoldingSetNodeID ID;
ID.AddInteger(scAddExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
SCEVAddExpr *S =
static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
if (!S) {
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
S = new (SCEVAllocator)
SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
}
S->setNoWrapFlags(Flags);
return S;
}
const SCEV *
ScalarEvolution::getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags) {
FoldingSetNodeID ID;
ID.AddInteger(scMulExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
SCEVMulExpr *S =
static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
if (!S) {
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator),
O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
}
S->setNoWrapFlags(Flags);
return S;
}
static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
uint64_t k = i*j;
if (j > 1 && k / j != i) Overflow = true;
return k;
}
/// Compute the result of "n choose k", the binomial coefficient. If an
/// intermediate computation overflows, Overflow will be set and the return will
/// be garbage. Overflow is not cleared when no overflow occurs.
static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
// We use the multiplicative formula:
// n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 .
// At each iteration i, we multiply by the i-th term of the numerator and
// divide by i. This division will always produce an
// integral result, and helps reduce the chance of overflow in the
// intermediate computations. However, we can still overflow even when the
// final result would fit.
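// For example, Choose(5, 2) computes r = 1*5/1 = 5 and then r = 5*4/2 = 10,
// matching C(5, 2) = 10, and never sets Overflow since no intermediate
// product exceeds 64 bits.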
if (n == 0 || n == k) return 1;
if (k > n) return 0;
if (k > n/2)
k = n-k;
uint64_t r = 1;
for (uint64_t i = 1; i <= k; ++i) {
r = umul_ov(r, n-(i-1), Overflow);
r /= i;
}
return r;
}
/// Determine if any of the operands in this SCEV are a constant or if
/// any of the add or multiply expressions in this SCEV contain a constant.
static bool containsConstantSomewhere(const SCEV *StartExpr) {
SmallVector<const SCEV *, 4> Ops;
Ops.push_back(StartExpr);
while (!Ops.empty()) {
const SCEV *CurrentExpr = Ops.pop_back_val();
if (isa<SCEVConstant>(*CurrentExpr))
return true;
if (isa<SCEVAddExpr>(*CurrentExpr) || isa<SCEVMulExpr>(*CurrentExpr)) {
const auto *CurrentNAry = cast<SCEVNAryExpr>(CurrentExpr);
Ops.append(CurrentNAry->op_begin(), CurrentNAry->op_end());
}
}
return false;
}
/// Get a canonical multiply expression, or something simpler if possible.
const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
SCEV::NoWrapFlags Flags,
unsigned Depth) {
assert(Flags == maskFlags(Flags, SCEV::FlagNUW | SCEV::FlagNSW) &&
"only nuw or nsw allowed");
assert(!Ops.empty() && "Cannot get empty mul!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVMulExpr operand types don't match!");
#endif
// Sort by complexity; this groups all similar expression types together.
GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
// Limit the recursion depth.
if (Depth > MaxArithDepth)
return getOrCreateMulExpr(Ops, Flags);
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
// C1*(C2+V) -> C1*C2 + C1*V
if (Ops.size() == 2)
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
// If any of Add's ops are Adds or Muls with a constant,
// apply this transformation as well.
if (Add->getNumOperands() == 2)
if (containsConstantSomewhere(Add))
return getAddExpr(getMulExpr(LHSC, Add->getOperand(0),
SCEV::FlagAnyWrap, Depth + 1),
getMulExpr(LHSC, Add->getOperand(1),
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
++Idx;
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
// We found two constants, fold them together!
ConstantInt *Fold =
ConstantInt::get(getContext(), LHSC->getAPInt() * RHSC->getAPInt());
Ops[0] = getConstant(Fold);
Ops.erase(Ops.begin()+1); // Erase the folded element
if (Ops.size() == 1) return Ops[0];
LHSC = cast<SCEVConstant>(Ops[0]);
}
// If we are left with a constant one being multiplied, strip it off.
if (cast<SCEVConstant>(Ops[0])->getValue()->isOne()) {
Ops.erase(Ops.begin());
--Idx;
} else if (cast<SCEVConstant>(Ops[0])->getValue()->isZero()) {
// If we have a multiply of zero, it will always be zero.
return Ops[0];
} else if (Ops[0]->isAllOnesValue()) {
// If we have a mul by -1 of an add, try distributing the -1 among the
// add operands.
if (Ops.size() == 2) {
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1])) {
SmallVector<const SCEV *, 4> NewOps;
bool AnyFolded = false;
for (const SCEV *AddOp : Add->operands()) {
const SCEV *Mul = getMulExpr(Ops[0], AddOp, SCEV::FlagAnyWrap,
Depth + 1);
if (!isa<SCEVMulExpr>(Mul)) AnyFolded = true;
NewOps.push_back(Mul);
}
if (AnyFolded)
return getAddExpr(NewOps, SCEV::FlagAnyWrap, Depth + 1);
} else if (const auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) {
// Negation preserves a recurrence's no self-wrap property.
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *AddRecOp : AddRec->operands())
Operands.push_back(getMulExpr(Ops[0], AddRecOp, SCEV::FlagAnyWrap,
Depth + 1));
return getAddRecExpr(Operands, AddRec->getLoop(),
AddRec->getNoWrapFlags(SCEV::FlagNW));
}
}
}
if (Ops.size() == 1)
return Ops[0];
}
// Skip over the add expression until we get to a multiply.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
++Idx;
// If there are mul operands, inline them all into this expression.
if (Idx < Ops.size()) {
bool DeletedMul = false;
while (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[Idx])) {
if (Ops.size() > MulOpsInlineThreshold)
break;
// If we have a mul, expand the mul operands onto the end of the
// operands list.
Ops.erase(Ops.begin()+Idx);
Ops.append(Mul->op_begin(), Mul->op_end());
DeletedMul = true;
}
// If we deleted at least one mul, we added operands to the end of the
// list, and they are not necessarily sorted. Recurse to resort and
// resimplify any operands we just acquired.
if (DeletedMul)
return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// If there are any add recurrences in the operands list, see if any other
// added values are loop invariant. If so, we can fold them into the
// recurrence.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
++Idx;
// Scan over all recurrences, trying to fold loop invariants into them.
for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
// Scan all of the other operands to this mul and add them to the vector
// if they are loop invariant w.r.t. the recurrence.
SmallVector<const SCEV *, 8> LIOps;
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
const Loop *AddRecLoop = AddRec->getLoop();
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
LIOps.push_back(Ops[i]);
Ops.erase(Ops.begin()+i);
--i; --e;
}
// If we found some loop invariants, fold them into the recurrence.
if (!LIOps.empty()) {
// NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step}
SmallVector<const SCEV *, 4> NewOps;
NewOps.reserve(AddRec->getNumOperands());
const SCEV *Scale = getMulExpr(LIOps, SCEV::FlagAnyWrap, Depth + 1);
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i),
SCEV::FlagAnyWrap, Depth + 1));
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer mul and the inner addrec are guaranteed to have no overflow.
//
// The no-self-wrap flag cannot be guaranteed after changing the step size, but
// will be inferred if either NUW or NSW is true.
Flags = AddRec->getNoWrapFlags(clearFlags(Flags, SCEV::FlagNW));
const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, Flags);
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
// Otherwise, multiply the folded AddRec by the non-invariant parts.
for (unsigned i = 0;; ++i)
if (Ops[i] == AddRec) {
Ops[i] = NewRec;
break;
}
return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Okay, if there weren't any loop invariants to be folded, check to see
// if there are multiple AddRecs with the same loop induction variable
// being multiplied together. If so, we can fold them.
// {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
// = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
//       choose(x, 2x-y)*choose(2x-y, x-z)*A_{y-z}*B_z
// ]]],+,...up to x=2n}.
// Note that the arguments to choose() are always integers with values
// known at compile time, never SCEV objects.
//
// The implementation avoids pointless extra computations when the two
// addrec's are of different length (mathematically, it's equivalent to
// an infinite stream of zeros on the right).
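// In the simplest (affine) case this reduces to
//   {a,+,b}<L> * {c,+,d}<L> = {a*c,+,a*d+b*c+b*d,+,2*b*d}<L>,
// since (a + b*i)*(c + d*i) = a*c + (a*d+b*c)*i + b*d*i^2 and, in the
// binomial basis used by add recurrences, i^2 = choose(i,1) + 2*choose(i,2).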
bool OpsModified = false;
for (unsigned OtherIdx = Idx+1;
OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
++OtherIdx) {
const SCEVAddRecExpr *OtherAddRec =
dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]);
if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
continue;
+ // Limit max number of arguments to avoid creation of unreasonably big
+ // SCEVAddRecs with very complex operands.
+ if (AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1 >
+ MaxAddRecSize)
+ continue;
+
bool Overflow = false;
Type *Ty = AddRec->getType();
bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
SmallVector<const SCEV*, 7> AddRecOps;
for (int x = 0, xe = AddRec->getNumOperands() +
OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
const SCEV *Term = getZero(Ty);
for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
z < ze && !Overflow; ++z) {
uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
uint64_t Coeff;
if (LargerThan64Bits)
Coeff = umul_ov(Coeff1, Coeff2, Overflow);
else
Coeff = Coeff1*Coeff2;
const SCEV *CoeffTerm = getConstant(Ty, Coeff);
const SCEV *Term1 = AddRec->getOperand(y-z);
const SCEV *Term2 = OtherAddRec->getOperand(z);
Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1, Term2,
SCEV::FlagAnyWrap, Depth + 1),
SCEV::FlagAnyWrap, Depth + 1);
}
}
AddRecOps.push_back(Term);
}
if (!Overflow) {
const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
SCEV::FlagAnyWrap);
if (Ops.size() == 2) return NewAddRec;
Ops[Idx] = NewAddRec;
Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
OpsModified = true;
AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec);
if (!AddRec)
break;
}
}
if (OpsModified)
return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
}
// Okay, it looks like we really DO need a mul expr. Check to see if we
// already have one, otherwise create a new one.
return getOrCreateMulExpr(Ops, Flags);
}
/// Get a canonical unsigned division expression, or something simpler if
/// possible.
const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
const SCEV *RHS) {
assert(getEffectiveSCEVType(LHS->getType()) ==
getEffectiveSCEVType(RHS->getType()) &&
"SCEVUDivExpr operand types don't match!");
if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
if (RHSC->getValue()->isOne())
return LHS; // X udiv 1 --> x
// If the denominator is zero, the result of the udiv is undefined. Don't
// try to analyze it, because the resolution chosen here may differ from
// the resolution chosen in other parts of the compiler.
if (!RHSC->getValue()->isZero()) {
// Determine if the division can be folded into the operands of
// the dividend.
// TODO: Generalize this to non-constants by using known-bits information.
Type *Ty = LHS->getType();
unsigned LZ = RHSC->getAPInt().countLeadingZeros();
unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1;
// For non-power-of-two values, effectively round the value up to the
// nearest power of two.
if (!RHSC->getAPInt().isPowerOf2())
++MaxShiftAmt;
IntegerType *ExtTy =
IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt);
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
if (const SCEVConstant *Step =
dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this))) {
// {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded.
const APInt &StepInt = Step->getAPInt();
const APInt &DivInt = RHSC->getAPInt();
if (!StepInt.urem(DivInt) &&
getZeroExtendExpr(AR, ExtTy) ==
getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
getZeroExtendExpr(Step, ExtTy),
AR->getLoop(), SCEV::FlagAnyWrap)) {
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *Op : AR->operands())
Operands.push_back(getUDivExpr(Op, RHS));
return getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagNW);
}
// Get a canonical UDivExpr for a recurrence.
// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
// We can currently only fold X%N if X is constant.
const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
if (StartC && !DivInt.urem(StepInt) &&
getZeroExtendExpr(AR, ExtTy) ==
getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
getZeroExtendExpr(Step, ExtTy),
AR->getLoop(), SCEV::FlagAnyWrap)) {
const APInt &StartInt = StartC->getAPInt();
const APInt &StartRem = StartInt.urem(StepInt);
if (StartRem != 0)
LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step,
AR->getLoop(), SCEV::FlagNW);
}
}
// (A*B)/C --> A*(B/C) if safe and B/C can be folded.
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) {
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *Op : M->operands())
Operands.push_back(getZeroExtendExpr(Op, ExtTy));
if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands))
// Find an operand that's safely divisible.
for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
const SCEV *Op = M->getOperand(i);
const SCEV *Div = getUDivExpr(Op, RHSC);
if (!isa<SCEVUDivExpr>(Div) && getMulExpr(Div, RHSC) == Op) {
Operands = SmallVector<const SCEV *, 4>(M->op_begin(),
M->op_end());
Operands[i] = Div;
return getMulExpr(Operands);
}
}
}
// (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) {
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *Op : A->operands())
Operands.push_back(getZeroExtendExpr(Op, ExtTy));
if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) {
Operands.clear();
for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) {
const SCEV *Op = getUDivExpr(A->getOperand(i), RHS);
if (isa<SCEVUDivExpr>(Op) ||
getMulExpr(Op, RHS) != A->getOperand(i))
break;
Operands.push_back(Op);
}
if (Operands.size() == A->getNumOperands())
return getAddExpr(Operands);
}
}
// Fold if both operands are constant.
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
Constant *LHSCV = LHSC->getValue();
Constant *RHSCV = RHSC->getValue();
return getConstant(cast<ConstantInt>(ConstantExpr::getUDiv(LHSCV,
RHSCV)));
}
}
}
FoldingSetNodeID ID;
ID.AddInteger(scUDivExpr);
ID.AddPointer(LHS);
ID.AddPointer(RHS);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator),
LHS, RHS);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
APInt A = C1->getAPInt().abs();
APInt B = C2->getAPInt().abs();
uint32_t ABW = A.getBitWidth();
uint32_t BBW = B.getBitWidth();
if (ABW > BBW)
B = B.zext(ABW);
else if (ABW < BBW)
A = A.zext(BBW);
return APIntOps::GreatestCommonDivisor(std::move(A), std::move(B));
}
/// Get a canonical unsigned division expression, or something simpler if
/// possible. There is no representation for an exact udiv in SCEV IR, but we
/// can attempt to remove factors from the LHS and RHS. We can't do this when
/// it's not exact because the udiv may be clearing bits.
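/// For illustration: (x * y)<nuw> /u y cancels the matching operand and
/// yields x, while (6 * x)<nuw> /u 4 first divides the common factor 2 out
/// of both constants and then falls back to an ordinary (3 * x) /u 2.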
const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS,
const SCEV *RHS) {
// TODO: we could try to find factors in all sorts of things, but for now we
// just deal with u/exact (multiply, constant). See SCEVDivision towards the
// end of this file for inspiration.
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS);
if (!Mul || !Mul->hasNoUnsignedWrap())
return getUDivExpr(LHS, RHS);
if (const SCEVConstant *RHSCst = dyn_cast<SCEVConstant>(RHS)) {
// If the mulexpr multiplies by a constant, then that constant must be the
// first element of the mulexpr.
if (const auto *LHSCst = dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
if (LHSCst == RHSCst) {
SmallVector<const SCEV *, 2> Operands;
Operands.append(Mul->op_begin() + 1, Mul->op_end());
return getMulExpr(Operands);
}
// We can't just assume that LHSCst divides RHSCst cleanly; it could be
// that there's a factor provided by one of the other terms. We need to
// check.
APInt Factor = gcd(LHSCst, RHSCst);
if (!Factor.isIntN(1)) {
LHSCst =
cast<SCEVConstant>(getConstant(LHSCst->getAPInt().udiv(Factor)));
RHSCst =
cast<SCEVConstant>(getConstant(RHSCst->getAPInt().udiv(Factor)));
SmallVector<const SCEV *, 2> Operands;
Operands.push_back(LHSCst);
Operands.append(Mul->op_begin() + 1, Mul->op_end());
LHS = getMulExpr(Operands);
RHS = RHSCst;
Mul = dyn_cast<SCEVMulExpr>(LHS);
if (!Mul)
return getUDivExactExpr(LHS, RHS);
}
}
}
for (int i = 0, e = Mul->getNumOperands(); i != e; ++i) {
if (Mul->getOperand(i) == RHS) {
SmallVector<const SCEV *, 2> Operands;
Operands.append(Mul->op_begin(), Mul->op_begin() + i);
Operands.append(Mul->op_begin() + i + 1, Mul->op_end());
return getMulExpr(Operands);
}
}
return getUDivExpr(LHS, RHS);
}
/// Get an add recurrence expression for the specified loop. Simplify the
/// expression as much as possible.
const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step,
const Loop *L,
SCEV::NoWrapFlags Flags) {
SmallVector<const SCEV *, 4> Operands;
Operands.push_back(Start);
if (const SCEVAddRecExpr *StepChrec = dyn_cast<SCEVAddRecExpr>(Step))
if (StepChrec->getLoop() == L) {
Operands.append(StepChrec->op_begin(), StepChrec->op_end());
return getAddRecExpr(Operands, L, maskFlags(Flags, SCEV::FlagNW));
}
Operands.push_back(Step);
return getAddRecExpr(Operands, L, Flags);
}
/// Get an add recurrence expression for the specified loop. Simplify the
/// expression as much as possible.
const SCEV *
ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
const Loop *L, SCEV::NoWrapFlags Flags) {
if (Operands.size() == 1) return Operands[0];
#ifndef NDEBUG
Type *ETy = getEffectiveSCEVType(Operands[0]->getType());
for (unsigned i = 1, e = Operands.size(); i != e; ++i)
assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy &&
"SCEVAddRecExpr operand types don't match!");
for (unsigned i = 0, e = Operands.size(); i != e; ++i)
assert(isLoopInvariant(Operands[i], L) &&
"SCEVAddRecExpr operand is not loop-invariant!");
#endif
if (Operands.back()->isZero()) {
Operands.pop_back();
return getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); // {X,+,0} --> X
}
// It's tempting to call getMaxBackedgeTakenCount here and
// use that information to infer NUW and NSW flags. However, computing a
// BE count requires calling getAddRecExpr, so we may not yet have a
// meaningful BE count at this point (and if we don't, we'd be stuck
// with a SCEVCouldNotCompute as the cached BE count).
Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
// Canonicalize nested AddRecs by nesting them in order of loop depth.
if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
const Loop *NestedLoop = NestedAR->getLoop();
if (L->contains(NestedLoop)
? (L->getLoopDepth() < NestedLoop->getLoopDepth())
: (!NestedLoop->contains(L) &&
DT.dominates(L->getHeader(), NestedLoop->getHeader()))) {
SmallVector<const SCEV *, 4> NestedOperands(NestedAR->op_begin(),
NestedAR->op_end());
Operands[0] = NestedAR->getStart();
// AddRecs require their operands be loop-invariant with respect to their
// loops. Don't perform this transformation if it would break this
// requirement.
bool AllInvariant = all_of(
Operands, [&](const SCEV *Op) { return isLoopInvariant(Op, L); });
if (AllInvariant) {
// Create a recurrence for the outer loop with the same step size.
//
// The outer recurrence keeps its NW flag but only keeps NUW/NSW if the
// inner recurrence has the same property.
SCEV::NoWrapFlags OuterFlags =
maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags());
NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags);
AllInvariant = all_of(NestedOperands, [&](const SCEV *Op) {
return isLoopInvariant(Op, NestedLoop);
});
if (AllInvariant) {
// Ok, both add recurrences are valid after the transformation.
//
// The inner recurrence keeps its NW flag but only keeps NUW/NSW if
// the outer recurrence has the same property.
SCEV::NoWrapFlags InnerFlags =
maskFlags(NestedAR->getNoWrapFlags(), SCEV::FlagNW | Flags);
return getAddRecExpr(NestedOperands, NestedLoop, InnerFlags);
}
}
// Reset Operands to its original state.
Operands[0] = NestedAR;
}
}
// Okay, it looks like we really DO need an addrec expr. Check to see if we
// already have one, otherwise create a new one.
FoldingSetNodeID ID;
ID.AddInteger(scAddRecExpr);
for (unsigned i = 0, e = Operands.size(); i != e; ++i)
ID.AddPointer(Operands[i]);
ID.AddPointer(L);
void *IP = nullptr;
SCEVAddRecExpr *S =
static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
if (!S) {
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Operands.size());
std::uninitialized_copy(Operands.begin(), Operands.end(), O);
S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator),
O, Operands.size(), L);
UniqueSCEVs.InsertNode(S, IP);
}
S->setNoWrapFlags(Flags);
return S;
}
const SCEV *
ScalarEvolution::getGEPExpr(GEPOperator *GEP,
const SmallVectorImpl<const SCEV *> &IndexExprs) {
const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand());
// getSCEV(Base)->getType() has the same address space as Base->getType()
// because SCEV::getType() preserves the address space.
Type *IntPtrTy = getEffectiveSCEVType(BaseExpr->getType());
// FIXME(PR23527): Don't blindly transfer the inbounds flag from the GEP
// instruction to its SCEV, because the Instruction may be guarded by control
// flow and the no-overflow bits may not be valid for the expression in any
// context. This can be fixed similarly to how these flags are handled for
// adds.
SCEV::NoWrapFlags Wrap = GEP->isInBounds() ? SCEV::FlagNSW
: SCEV::FlagAnyWrap;
const SCEV *TotalOffset = getZero(IntPtrTy);
// The array size is unimportant. The first thing we do on CurTy is to get
// its element type.
Type *CurTy = ArrayType::get(GEP->getSourceElementType(), 0);
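// For example (assuming a 64-bit DataLayout with natural struct layout), a
// GEP with source element type { i32, i32 } and indices (i, 1) accumulates
// TotalOffset = i*8 + 4: the array-style index is scaled by the struct size
// and the struct index contributes the offset of field 1.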
for (const SCEV *IndexExpr : IndexExprs) {
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = dyn_cast<StructType>(CurTy)) {
// For a struct, add the member offset.
ConstantInt *Index = cast<SCEVConstant>(IndexExpr)->getValue();
unsigned FieldNo = Index->getZExtValue();
const SCEV *FieldOffset = getOffsetOfExpr(IntPtrTy, STy, FieldNo);
// Add the field offset to the running total offset.
TotalOffset = getAddExpr(TotalOffset, FieldOffset);
// Update CurTy to the type of the field at Index.
CurTy = STy->getTypeAtIndex(Index);
} else {
// Update CurTy to its element type.
CurTy = cast<SequentialType>(CurTy)->getElementType();
// For an array, add the element offset, explicitly scaled.
const SCEV *ElementSize = getSizeOfExpr(IntPtrTy, CurTy);
// Getelementptr indices are signed.
IndexExpr = getTruncateOrSignExtend(IndexExpr, IntPtrTy);
// Multiply the index by the element size to compute the element offset.
const SCEV *LocalOffset = getMulExpr(IndexExpr, ElementSize, Wrap);
// Add the element offset to the running total offset.
TotalOffset = getAddExpr(TotalOffset, LocalOffset);
}
}
// Add the total offset from all the GEP indices to the base.
return getAddExpr(BaseExpr, TotalOffset, Wrap);
}
const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS,
const SCEV *RHS) {
SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
return getSMaxExpr(Ops);
}
const SCEV *
ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
assert(!Ops.empty() && "Cannot get empty smax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVSMaxExpr operand types don't match!");
#endif
// Sort by complexity; this groups all similar expression types together.
GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
++Idx;
assert(Idx < Ops.size());
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
// We found two constants, fold them together!
ConstantInt *Fold = ConstantInt::get(
getContext(), APIntOps::smax(LHSC->getAPInt(), RHSC->getAPInt()));
Ops[0] = getConstant(Fold);
Ops.erase(Ops.begin()+1); // Erase the folded element
if (Ops.size() == 1) return Ops[0];
LHSC = cast<SCEVConstant>(Ops[0]);
}
// If we are left with a constant minimum-int, strip it off.
if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(true)) {
Ops.erase(Ops.begin());
--Idx;
} else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(true)) {
// If we have an smax with a constant maximum-int, it will always be
// maximum-int.
return Ops[0];
}
if (Ops.size() == 1) return Ops[0];
}
// Find the first SMax
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scSMaxExpr)
++Idx;
// Check to see if one of the operands is an SMax. If so, expand its operands
// onto our operand list, and recurse to simplify.
if (Idx < Ops.size()) {
bool DeletedSMax = false;
while (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(Ops[Idx])) {
Ops.erase(Ops.begin()+Idx);
Ops.append(SMax->op_begin(), SMax->op_end());
DeletedSMax = true;
}
if (DeletedSMax)
return getSMaxExpr(Ops);
}
// Okay, check to see if the same value occurs in the operand list twice. If
// so, delete one. Since we sorted the list, these values are required to
// be adjacent.
for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
// X smax Y smax Y --> X smax Y
// X smax Y --> X, if X is always greater than Y
if (Ops[i] == Ops[i+1] ||
isKnownPredicate(ICmpInst::ICMP_SGE, Ops[i], Ops[i+1])) {
Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
--i; --e;
} else if (isKnownPredicate(ICmpInst::ICMP_SLE, Ops[i], Ops[i+1])) {
Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
--i; --e;
}
if (Ops.size() == 1) return Ops[0];
assert(!Ops.empty() && "Reduced smax down to nothing!");
// Okay, it looks like we really DO need an smax expr. Check to see if we
// already have one, otherwise create a new one.
FoldingSetNodeID ID;
ID.AddInteger(scSMaxExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
SCEV *S = new (SCEVAllocator) SCEVSMaxExpr(ID.Intern(SCEVAllocator),
O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
return S;
}
const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS,
const SCEV *RHS) {
SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
return getUMaxExpr(Ops);
}
const SCEV *
ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
assert(!Ops.empty() && "Cannot get empty umax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVUMaxExpr operand types don't match!");
#endif
// Sort by complexity; this groups all similar expression types together.
GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
++Idx;
assert(Idx < Ops.size());
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
// We found two constants, fold them together!
ConstantInt *Fold = ConstantInt::get(
getContext(), APIntOps::umax(LHSC->getAPInt(), RHSC->getAPInt()));
Ops[0] = getConstant(Fold);
Ops.erase(Ops.begin()+1); // Erase the folded element
if (Ops.size() == 1) return Ops[0];
LHSC = cast<SCEVConstant>(Ops[0]);
}
// If we are left with a constant minimum-int, strip it off.
if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(false)) {
Ops.erase(Ops.begin());
--Idx;
} else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(false)) {
// If we have an umax with a constant maximum-int, it will always be
// maximum-int.
return Ops[0];
}
if (Ops.size() == 1) return Ops[0];
}
// Find the first UMax
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scUMaxExpr)
++Idx;
// Check to see if one of the operands is a UMax. If so, expand its operands
// onto our operand list, and recurse to simplify.
if (Idx < Ops.size()) {
bool DeletedUMax = false;
while (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(Ops[Idx])) {
Ops.erase(Ops.begin()+Idx);
Ops.append(UMax->op_begin(), UMax->op_end());
DeletedUMax = true;
}
if (DeletedUMax)
return getUMaxExpr(Ops);
}
// Okay, check to see if the same value occurs in the operand list twice. If
// so, delete one. Since we sorted the list, these values are required to
// be adjacent.
for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
// X umax Y umax Y --> X umax Y
// X umax Y --> X, if X is always greater than Y
if (Ops[i] == Ops[i+1] ||
isKnownPredicate(ICmpInst::ICMP_UGE, Ops[i], Ops[i+1])) {
Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
--i; --e;
} else if (isKnownPredicate(ICmpInst::ICMP_ULE, Ops[i], Ops[i+1])) {
Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
--i; --e;
}
if (Ops.size() == 1) return Ops[0];
assert(!Ops.empty() && "Reduced umax down to nothing!");
// Okay, it looks like we really DO need a umax expr. Check to see if we
// already have one, otherwise create a new one.
FoldingSetNodeID ID;
ID.AddInteger(scUMaxExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
SCEV *S = new (SCEVAllocator) SCEVUMaxExpr(ID.Intern(SCEVAllocator),
O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
return S;
}
const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS,
const SCEV *RHS) {
// ~smax(~x, ~y) == smin(x, y).
return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
const SCEV *RHS) {
// ~umax(~x, ~y) == umin(x, y)
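// (With ~v = -1 - v, i.e. UINT_MAX - v, complementing reverses the unsigned
// order, so umax(~x, ~y) == ~umin(x, y) and complementing again yields
// umin(x, y).)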
return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
// We can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy));
}
const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy,
StructType *STy,
unsigned FieldNo) {
// We can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
return getConstant(
IntTy, getDataLayout().getStructLayout(STy)->getElementOffset(FieldNo));
}
const SCEV *ScalarEvolution::getUnknown(Value *V) {
// Don't attempt to do anything other than create a SCEVUnknown object
// here. createSCEV only calls getUnknown after checking for all other
// interesting possibilities, and any other code that calls getUnknown
// is doing so in order to hide a value from SCEV canonicalization.
FoldingSetNodeID ID;
ID.AddInteger(scUnknown);
ID.AddPointer(V);
void *IP = nullptr;
if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) {
assert(cast<SCEVUnknown>(S)->getValue() == V &&
"Stale SCEVUnknown in uniquing map!");
return S;
}
SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V, this,
FirstUnknown);
FirstUnknown = cast<SCEVUnknown>(S);
UniqueSCEVs.InsertNode(S, IP);
return S;
}
//===----------------------------------------------------------------------===//
// Basic SCEV Analysis and PHI Idiom Recognition Code
//
/// Test if values of the given type are analyzable within the SCEV
/// framework. This primarily includes integer types, and it can optionally
/// include pointer types if the ScalarEvolution class has access to
/// target-specific information.
bool ScalarEvolution::isSCEVable(Type *Ty) const {
// Integers and pointers are always SCEVable.
return Ty->isIntegerTy() || Ty->isPointerTy();
}
/// Return the size in bits of the specified type, for which isSCEVable must
/// return true.
uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
return getDataLayout().getTypeSizeInBits(Ty);
}
/// Return a type with the same bitwidth as the given type and which represents
/// how SCEV will treat the given type, for which isSCEVable must return
/// true. For pointer types, this is the pointer-sized integer type.
Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
if (Ty->isIntegerTy())
return Ty;
// The only other supported type is pointer.
assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
return getDataLayout().getIntPtrType(Ty);
}
Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const {
return getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2;
}
const SCEV *ScalarEvolution::getCouldNotCompute() {
return CouldNotCompute.get();
}
bool ScalarEvolution::checkValidity(const SCEV *S) const {
bool ContainsNulls = SCEVExprContains(S, [](const SCEV *S) {
auto *SU = dyn_cast<SCEVUnknown>(S);
return SU && SU->getValue() == nullptr;
});
return !ContainsNulls;
}
bool ScalarEvolution::containsAddRecurrence(const SCEV *S) {
HasRecMapType::iterator I = HasRecMap.find(S);
if (I != HasRecMap.end())
return I->second;
bool FoundAddRec = SCEVExprContains(S, isa<SCEVAddRecExpr, const SCEV *>);
HasRecMap.insert({S, FoundAddRec});
return FoundAddRec;
}
/// Try to split a SCEVAddExpr into a pair of {SCEV, ConstantInt}.
/// If \p S is a SCEVAddExpr and is composed of a sub SCEV S' and an
/// offset I, then return {S', I}, else return {\p S, nullptr}.
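/// For example, S = (5 + %a) yields {%a, 5}, whereas a plain %a, an add with
/// more than two operands, or an add whose first operand is not a constant
/// all yield {S, nullptr}.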
static std::pair<const SCEV *, ConstantInt *> splitAddExpr(const SCEV *S) {
const auto *Add = dyn_cast<SCEVAddExpr>(S);
if (!Add)
return {S, nullptr};
if (Add->getNumOperands() != 2)
return {S, nullptr};
auto *ConstOp = dyn_cast<SCEVConstant>(Add->getOperand(0));
if (!ConstOp)
return {S, nullptr};
return {Add->getOperand(1), ConstOp->getValue()};
}
/// Return the ValueOffsetPair set for \p S. \p S can be represented
/// by the value and offset from any ValueOffsetPair in the set.
SetVector<ScalarEvolution::ValueOffsetPair> *
ScalarEvolution::getSCEVValues(const SCEV *S) {
ExprValueMapType::iterator SI = ExprValueMap.find_as(S);
if (SI == ExprValueMap.end())
return nullptr;
#ifndef NDEBUG
if (VerifySCEVMap) {
// Check there is no dangling Value in the set returned.
for (const auto &VE : SI->second)
assert(ValueExprMap.count(VE.first));
}
#endif
return &SI->second;
}
/// Erase Value from ValueExprMap and ExprValueMap. ValueExprMap.erase(V)
/// cannot be used separately; eraseValueFromMap should be used to remove
/// V from both maps at the same time.
void ScalarEvolution::eraseValueFromMap(Value *V) {
ValueExprMapType::iterator I = ValueExprMap.find_as(V);
if (I != ValueExprMap.end()) {
const SCEV *S = I->second;
// Remove {V, 0} from the set of ExprValueMap[S]
if (SetVector<ValueOffsetPair> *SV = getSCEVValues(S))
SV->remove({V, nullptr});
// Remove {V, Offset} from the set of ExprValueMap[Stripped]
const SCEV *Stripped;
ConstantInt *Offset;
std::tie(Stripped, Offset) = splitAddExpr(S);
if (Offset != nullptr) {
if (SetVector<ValueOffsetPair> *SV = getSCEVValues(Stripped))
SV->remove({V, Offset});
}
ValueExprMap.erase(V);
}
}
/// Return an existing SCEV if it exists, otherwise analyze the expression and
/// create a new one.
const SCEV *ScalarEvolution::getSCEV(Value *V) {
assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
const SCEV *S = getExistingSCEV(V);
if (S == nullptr) {
S = createSCEV(V);
// During PHI resolution, it is possible to create two SCEVs for the same
// V, so we need to double-check whether V->S has been inserted into
// ValueExprMap before inserting S->{V, 0} into ExprValueMap.
std::pair<ValueExprMapType::iterator, bool> Pair =
ValueExprMap.insert({SCEVCallbackVH(V, this), S});
if (Pair.second) {
ExprValueMap[S].insert({V, nullptr});
// If S == Stripped + Offset, add Stripped -> {V, Offset} into
// ExprValueMap.
const SCEV *Stripped = S;
ConstantInt *Offset = nullptr;
std::tie(Stripped, Offset) = splitAddExpr(S);
// If stripped is SCEVUnknown, don't bother to save
// Stripped -> {V, offset}. It doesn't simplify and sometimes even
// increases the complexity of the expansion code.
// If V is GetElementPtrInst, don't save Stripped -> {V, offset}
// because it may generate add/sub instead of GEP in SCEV expansion.
if (Offset != nullptr && !isa<SCEVUnknown>(Stripped) &&
!isa<GetElementPtrInst>(V))
ExprValueMap[Stripped].insert({V, Offset});
}
}
return S;
}
const SCEV *ScalarEvolution::getExistingSCEV(Value *V) {
assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
ValueExprMapType::iterator I = ValueExprMap.find_as(V);
if (I != ValueExprMap.end()) {
const SCEV *S = I->second;
if (checkValidity(S))
return S;
eraseValueFromMap(V);
forgetMemoizedResults(S);
}
return nullptr;
}
/// Return a SCEV corresponding to -V = -1*V
///
const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
SCEV::NoWrapFlags Flags) {
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getConstant(
cast<ConstantInt>(ConstantExpr::getNeg(VC->getValue())));
Type *Ty = V->getType();
Ty = getEffectiveSCEVType(Ty);
return getMulExpr(
V, getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))), Flags);
}
/// Return a SCEV corresponding to ~V = -1-V
const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
return getConstant(
cast<ConstantInt>(ConstantExpr::getNot(VC->getValue())));
Type *Ty = V->getType();
Ty = getEffectiveSCEVType(Ty);
const SCEV *AllOnes =
getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty)));
return getMinusSCEV(AllOnes, V);
}
const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
SCEV::NoWrapFlags Flags,
unsigned Depth) {
// Fast path: X - X --> 0.
if (LHS == RHS)
return getZero(LHS->getType());
// We represent LHS - RHS as LHS + (-1)*RHS. This transformation
// makes it so that we cannot make much use of NUW.
auto AddFlags = SCEV::FlagAnyWrap;
const bool RHSIsNotMinSigned =
!getSignedRangeMin(RHS).isMinSignedValue();
if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) {
// Let M be the minimum representable signed value. Then (-1)*RHS
// signed-wraps if and only if RHS is M. That can happen even for
// a NSW subtraction because e.g. (-1)*M signed-wraps even though
// -1 - M does not. So to transfer NSW from LHS - RHS to LHS +
// (-1)*RHS, we need to prove that RHS != M.
//
// If LHS is non-negative and we know that LHS - RHS does not
// signed-wrap, then RHS cannot be M. So we can rule out signed-wrap
// either by proving that RHS > M or that LHS >= 0.
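// Concretely, for i8: if RHS could be -128 (that is, M), then (-1) * RHS
// wraps back to -128, so NSW cannot be transferred to the negation in that
// case.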
if (RHSIsNotMinSigned || isKnownNonNegative(LHS)) {
AddFlags = SCEV::FlagNSW;
}
}
// FIXME: Find a correct way to transfer NSW to (-1)*M when LHS -
// RHS is NSW and LHS >= 0.
//
// The difficulty here is that the NSW flag may have been proven
// relative to a loop that is to be found in a recurrence in LHS and
// not in RHS. Applying NSW to (-1)*M may then let the NSW have a
// larger scope than intended.
auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags, Depth);
}
const SCEV *
ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or zero extend with non-integer arguments!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
return getTruncateExpr(V, Ty);
return getZeroExtendExpr(V, Ty);
}
const SCEV *
ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or zero extend with non-integer arguments!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
return getTruncateExpr(V, Ty);
return getSignExtendExpr(V, Ty);
}
const SCEV *
ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or zero extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrZeroExtend cannot truncate!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
return getZeroExtendExpr(V, Ty);
}
const SCEV *
ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or sign extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrSignExtend cannot truncate!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
return getSignExtendExpr(V, Ty);
}
const SCEV *
ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or any extend with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
"getNoopOrAnyExtend cannot truncate!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
return getAnyExtendExpr(V, Ty);
}
const SCEV *
ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or noop with non-integer arguments!");
assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) &&
"getTruncateOrNoop cannot extend!");
if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
return V; // No conversion
return getTruncateExpr(V, Ty);
}
const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
const SCEV *RHS) {
const SCEV *PromotedLHS = LHS;
const SCEV *PromotedRHS = RHS;
if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
else
PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
return getUMaxExpr(PromotedLHS, PromotedRHS);
}
const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
const SCEV *RHS) {
const SCEV *PromotedLHS = LHS;
const SCEV *PromotedRHS = RHS;
if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
else
PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
return getUMinExpr(PromotedLHS, PromotedRHS);
}
const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
// A pointer operand may evaluate to a nonpointer expression, such as null.
if (!V->getType()->isPointerTy())
return V;
if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) {
return getPointerBase(Cast->getOperand());
} else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) {
const SCEV *PtrOp = nullptr;
for (const SCEV *NAryOp : NAry->operands()) {
if (NAryOp->getType()->isPointerTy()) {
// Cannot find the base of an expression with multiple pointer operands.
if (PtrOp)
return V;
PtrOp = NAryOp;
}
}
if (!PtrOp)
return V;
return getPointerBase(PtrOp);
}
return V;
}
/// Push users of the given Instruction onto the given Worklist.
static void
PushDefUseChildren(Instruction *I,
SmallVectorImpl<Instruction *> &Worklist) {
// Push the def-use children onto the Worklist stack.
for (User *U : I->users())
Worklist.push_back(cast<Instruction>(U));
}
void ScalarEvolution::forgetSymbolicName(Instruction *PN, const SCEV *SymName) {
SmallVector<Instruction *, 16> Worklist;
PushDefUseChildren(PN, Worklist);
SmallPtrSet<Instruction *, 8> Visited;
Visited.insert(PN);
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (!Visited.insert(I).second)
continue;
auto It = ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
const SCEV *Old = It->second;
// Short-circuit the def-use traversal if the symbolic name
// ceases to appear in expressions.
if (Old != SymName && !hasOperand(Old, SymName))
continue;
// SCEVUnknown for a PHI either means that it has an unrecognized
// structure, it's a PHI that's in the process of being computed
// by createNodeForPHI, or it's a single-value PHI. In the first case,
// additional loop trip count information isn't going to change anything.
// In the second case, createNodeForPHI will perform the necessary
// updates on its own when it gets to that point. In the third, we do
// want to forget the SCEVUnknown.
if (!isa<PHINode>(I) ||
!isa<SCEVUnknown>(Old) ||
(I != PN && Old == SymName)) {
eraseValueFromMap(It->first);
forgetMemoizedResults(Old);
}
}
PushDefUseChildren(I, Worklist);
}
}
namespace {
class SCEVInitRewriter : public SCEVRewriteVisitor<SCEVInitRewriter> {
public:
static const SCEV *rewrite(const SCEV *S, const Loop *L,
ScalarEvolution &SE) {
SCEVInitRewriter Rewriter(L, SE);
const SCEV *Result = Rewriter.visit(S);
return Rewriter.isValid() ? Result : SE.getCouldNotCompute();
}
SCEVInitRewriter(const Loop *L, ScalarEvolution &SE)
: SCEVRewriteVisitor(SE), L(L), Valid(true) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
// Only allow AddRecExprs for this loop.
if (Expr->getLoop() == L)
return Expr->getStart();
Valid = false;
return Expr;
}
bool isValid() { return Valid; }
private:
const Loop *L;
bool Valid;
};
class SCEVShiftRewriter : public SCEVRewriteVisitor<SCEVShiftRewriter> {
public:
static const SCEV *rewrite(const SCEV *S, const Loop *L,
ScalarEvolution &SE) {
SCEVShiftRewriter Rewriter(L, SE);
const SCEV *Result = Rewriter.visit(S);
return Rewriter.isValid() ? Result : SE.getCouldNotCompute();
}
SCEVShiftRewriter(const Loop *L, ScalarEvolution &SE)
: SCEVRewriteVisitor(SE), L(L), Valid(true) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
// Only allow unknowns that are invariant in this loop.
if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
if (Expr->getLoop() == L && Expr->isAffine())
return SE.getMinusSCEV(Expr, Expr->getStepRecurrence(SE));
Valid = false;
return Expr;
}
bool isValid() { return Valid; }
private:
const Loop *L;
bool Valid;
};
} // end anonymous namespace
SCEV::NoWrapFlags
ScalarEvolution::proveNoWrapViaConstantRanges(const SCEVAddRecExpr *AR) {
if (!AR->isAffine())
return SCEV::FlagAnyWrap;
typedef OverflowingBinaryOperator OBO;
SCEV::NoWrapFlags Result = SCEV::FlagAnyWrap;
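// In both checks below the idea is the same: if every value the addrec can
// take lies inside the region where adding the step cannot overflow, we can
// report the corresponding no-wrap flag even if the IR did not carry it.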
if (!AR->hasNoSignedWrap()) {
ConstantRange AddRecRange = getSignedRange(AR);
ConstantRange IncRange = getSignedRange(AR->getStepRecurrence(*this));
auto NSWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
Instruction::Add, IncRange, OBO::NoSignedWrap);
if (NSWRegion.contains(AddRecRange))
Result = ScalarEvolution::setFlags(Result, SCEV::FlagNSW);
}
if (!AR->hasNoUnsignedWrap()) {
ConstantRange AddRecRange = getUnsignedRange(AR);
ConstantRange IncRange = getUnsignedRange(AR->getStepRecurrence(*this));
auto NUWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
Instruction::Add, IncRange, OBO::NoUnsignedWrap);
if (NUWRegion.contains(AddRecRange))
Result = ScalarEvolution::setFlags(Result, SCEV::FlagNUW);
}
return Result;
}
namespace {
/// Represents an abstract binary operation. This may exist as a
/// normal instruction or constant expression, or may have been
/// derived from an expression tree.
struct BinaryOp {
unsigned Opcode;
Value *LHS;
Value *RHS;
bool IsNSW;
bool IsNUW;
/// Op is set if this BinaryOp corresponds to a concrete LLVM instruction or
/// constant expression.
Operator *Op;
explicit BinaryOp(Operator *Op)
: Opcode(Op->getOpcode()), LHS(Op->getOperand(0)), RHS(Op->getOperand(1)),
IsNSW(false), IsNUW(false), Op(Op) {
if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Op)) {
IsNSW = OBO->hasNoSignedWrap();
IsNUW = OBO->hasNoUnsignedWrap();
}
}
explicit BinaryOp(unsigned Opcode, Value *LHS, Value *RHS, bool IsNSW = false,
bool IsNUW = false)
: Opcode(Opcode), LHS(LHS), RHS(RHS), IsNSW(IsNSW), IsNUW(IsNUW),
Op(nullptr) {}
};
}
/// Try to map \p V into a BinaryOp, and return \c None on failure.
static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
auto *Op = dyn_cast<Operator>(V);
if (!Op)
return None;
// Implementation detail: all the cleverness here should happen without
// creating new SCEV expressions -- our caller knows tricks to avoid creating
// SCEV expressions when possible, and we should not break that.
switch (Op->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::UDiv:
case Instruction::And:
case Instruction::Or:
case Instruction::AShr:
case Instruction::Shl:
return BinaryOp(Op);
case Instruction::Xor:
if (auto *RHSC = dyn_cast<ConstantInt>(Op->getOperand(1)))
// If the RHS of the xor is a signmask, then this is just an add.
// Instcombine turns add of signmask into xor as a strength reduction step.
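// Illustration (i8): the sign mask is 0x80; for any x, x ^ 0x80 equals
// x + 0x80 modulo 2^8, because 0x80 has no low bits that could carry.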
if (RHSC->getValue().isSignMask())
return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1));
return BinaryOp(Op);
case Instruction::LShr:
// Turn a logical shift right by a constant into an unsigned divide.
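// For example, (lshr %x, 3) is treated as (udiv %x, 8).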
if (ConstantInt *SA = dyn_cast<ConstantInt>(Op->getOperand(1))) {
uint32_t BitWidth = cast<IntegerType>(Op->getType())->getBitWidth();
// If the shift count is not less than the bitwidth, the result of
// the shift is undefined. Don't try to analyze it, because the
// resolution chosen here may differ from the resolution chosen in
// other parts of the compiler.
if (SA->getValue().ult(BitWidth)) {
Constant *X =
ConstantInt::get(SA->getContext(),
APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
return BinaryOp(Instruction::UDiv, Op->getOperand(0), X);
}
}
return BinaryOp(Op);
case Instruction::ExtractValue: {
auto *EVI = cast<ExtractValueInst>(Op);
if (EVI->getNumIndices() != 1 || EVI->getIndices()[0] != 0)
break;
auto *CI = dyn_cast<CallInst>(EVI->getAggregateOperand());
if (!CI)
break;
if (auto *F = CI->getCalledFunction())
switch (F->getIntrinsicID()) {
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow: {
if (!isOverflowIntrinsicNoWrap(cast<IntrinsicInst>(CI), DT))
return BinaryOp(Instruction::Add, CI->getArgOperand(0),
CI->getArgOperand(1));
// Now that we know that all uses of the arithmetic-result component of
// CI are guarded by the overflow check, we can go ahead and pretend
// that the arithmetic is non-overflowing.
if (F->getIntrinsicID() == Intrinsic::sadd_with_overflow)
return BinaryOp(Instruction::Add, CI->getArgOperand(0),
CI->getArgOperand(1), /* IsNSW = */ true,
/* IsNUW = */ false);
else
return BinaryOp(Instruction::Add, CI->getArgOperand(0),
CI->getArgOperand(1), /* IsNSW = */ false,
/* IsNUW*/ true);
}
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
return BinaryOp(Instruction::Sub, CI->getArgOperand(0),
CI->getArgOperand(1));
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
return BinaryOp(Instruction::Mul, CI->getArgOperand(0),
CI->getArgOperand(1));
default:
break;
}
}
default:
break;
}
return None;
}
/// Helper function for createAddRecFromPHIWithCasts. We have a phi
/// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via
/// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the
/// way. This function checks if \p Op, an operand of this SCEVAddExpr,
/// follows one of the following patterns:
/// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
/// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
/// If the SCEV expression of \p Op conforms with one of the expected patterns
/// we return the type of the truncation operation, and indicate whether the
/// truncated type should be treated as signed/unsigned by setting
/// \p Signed to true/false, respectively.
static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
bool &Signed, ScalarEvolution &SE) {
// The case where Op == SymbolicPHI (that is, with no type conversions on
// the way) is handled by the regular add recurrence creating logic and
// would have already been triggered in createAddRecFromPHI. Reaching it here
// means that createAddRecFromPHI had failed for this PHI before (e.g.,
// because one of the other operands of the SCEVAddExpr updating this PHI is
// not invariant).
//
// Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in
// this case predicates that allow us to prove that Op == SymbolicPHI will
// be added.
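// For example, if %SymbolicPHI has type i64 and Op is
// (sext i32 (trunc i64 %SymbolicPHI to i32) to i64), we return i32 and set
// Signed to true.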
if (Op == SymbolicPHI)
return nullptr;
unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType());
unsigned NewBits = SE.getTypeSizeInBits(Op->getType());
if (SourceBits != NewBits)
return nullptr;
const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
if (!SExt && !ZExt)
return nullptr;
const SCEVTruncateExpr *Trunc =
SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
: dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
if (!Trunc)
return nullptr;
const SCEV *X = Trunc->getOperand();
if (X != SymbolicPHI)
return nullptr;
Signed = SExt != nullptr;
return Trunc->getType();
}
static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
if (!PN->getType()->isIntegerTy())
return nullptr;
const Loop *L = LI.getLoopFor(PN->getParent());
if (!L || L->getHeader() != PN->getParent())
return nullptr;
return L;
}
// Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the
// computation that updates the phi matches the following pattern:
// (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum
// which corresponds to a phi->trunc->sext/zext->add->phi update chain.
// If so, try to see if it can be rewritten as an AddRecExpr under some
// Predicates. If successful, return them as a pair. Also cache the results
// of the analysis.
//
// Example usage scenario:
// Say the Rewriter is called for the following SCEV:
// 8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
// where:
// %X = phi i64 (%Start, %BEValue)
// It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X),
// and call this function with %SymbolicPHI = %X.
//
// The analysis will find that the value coming around the backedge has
// the following SCEV:
// BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
// Upon concluding that this matches the desired pattern, the function
// will return the pair {NewAddRec, SmallPredsVec} where:
// NewAddRec = {%Start,+,%Step}
// SmallPredsVec = {P1, P2, P3} as follows:
// P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)}<nsw> Flags: <nssw>
// P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64)
// P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64)
// The returned pair means that SymbolicPHI can be rewritten into NewAddRec
// under the predicates {P1,P2,P3}.
// This predicated rewrite will be cached in PredicatedSCEVRewrites:
// PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3}}
//
// TODO's:
//
// 1) Extend the Induction descriptor to also support inductions that involve
// casts: When needed (namely, when we are called in the context of the
// vectorizer induction analysis), a Set of cast instructions will be
// populated by this method, and provided back to isInductionPHI. This is
// needed to allow the vectorizer to properly record them to be ignored by
// the cost model and to avoid vectorizing them (otherwise these casts,
// which are redundant under the runtime overflow checks, will be
// vectorized, which can be costly).
//
// 2) Support additional induction/PHISCEV patterns: We also want to support
// inductions where the sext-trunc / zext-trunc operations (partly) occur
// after the induction update operation (the induction increment):
//
// (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix)
// which corresponds to a phi->add->trunc->sext/zext->phi update chain.
//
// (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix)
// which corresponds to a phi->trunc->add->sext/zext->phi update chain.
//
// 3) Outline common code with createAddRecFromPHI to avoid duplication.
//
Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) {
SmallVector<const SCEVPredicate *, 3> Predicates;
// *** Part1: Analyze if we have a phi-with-cast pattern for which we can
// return an AddRec expression under some predicate.
auto *PN = cast<PHINode>(SymbolicPHI->getValue());
const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
assert(L && "Expecting an integer loop header phi");
// The loop may have multiple entrances or multiple exits; we can analyze
// this phi as an addrec if it has a unique entry value and a unique
// backedge value.
Value *BEValueV = nullptr, *StartValueV = nullptr;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *V = PN->getIncomingValue(i);
if (L->contains(PN->getIncomingBlock(i))) {
if (!BEValueV) {
BEValueV = V;
} else if (BEValueV != V) {
BEValueV = nullptr;
break;
}
} else if (!StartValueV) {
StartValueV = V;
} else if (StartValueV != V) {
StartValueV = nullptr;
break;
}
}
if (!BEValueV || !StartValueV)
return None;
const SCEV *BEValue = getSCEV(BEValueV);
// If the value coming around the backedge is an add with the symbolic
// value we just inserted, possibly with casts that we can ignore under
// an appropriate runtime guard, then we found a simple induction variable!
const auto *Add = dyn_cast<SCEVAddExpr>(BEValue);
if (!Add)
return None;
// If there is a single occurrence of the symbolic value, possibly
// casted, replace it with a recurrence.
unsigned FoundIndex = Add->getNumOperands();
Type *TruncTy = nullptr;
bool Signed;
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
if ((TruncTy =
isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this)))
if (FoundIndex == e) {
FoundIndex = i;
break;
}
if (FoundIndex == Add->getNumOperands())
return None;
// Create an add with everything but the specified operand.
SmallVector<const SCEV *, 8> Ops;
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
if (i != FoundIndex)
Ops.push_back(Add->getOperand(i));
const SCEV *Accum = getAddExpr(Ops);
// The runtime checks will not be valid if the step amount is
// varying inside the loop.
if (!isLoopInvariant(Accum, L))
return None;
// *** Part2: Create the predicates
// Analysis was successful: we have a phi-with-cast pattern for which we
// can return an AddRec expression under the following predicates:
//
// P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum)
// fits within the truncated type (does not overflow) for i = 0 to n-1.
// P2: An Equal predicate that guarantees that
// Start = (Ext ix (Trunc iy (Start) to ix) to iy)
// P3: An Equal predicate that guarantees that
// Accum = (Ext ix (Trunc iy (Accum) to ix) to iy)
//
// As we next prove, the above predicates guarantee that:
// Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy)
//
//
// More formally, we want to prove that:
// Expr(i+1) = Start + (i+1) * Accum
// = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
//
// Given that:
// 1) Expr(0) = Start
// 2) Expr(1) = Start + Accum
// = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2
// 3) Induction hypothesis (step i):
// Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum
//
// Proof:
// Expr(i+1) =
// = Start + (i+1)*Accum
// = (Start + i*Accum) + Accum
// = Expr(i) + Accum
// = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum
// :: from step i
//
// = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum
//
// = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy)
// + (Ext ix (Trunc iy (Accum) to ix) to iy)
// + Accum :: from P3
//
// = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy)
// + Accum :: from P1: Ext(x)+Ext(y)=>Ext(x+y)
//
// = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum
// = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
//
// By induction, the same applies to all iterations 1<=i<n:
//
// Create a truncated addrec for which we will add a no overflow check (P1).
const SCEV *StartVal = getSCEV(StartValueV);
const SCEV *PHISCEV =
getAddRecExpr(getTruncateExpr(StartVal, TruncTy),
getTruncateExpr(Accum, TruncTy), L, SCEV::FlagAnyWrap);
const auto *AR = cast<SCEVAddRecExpr>(PHISCEV);
SCEVWrapPredicate::IncrementWrapFlags AddedFlags =
Signed ? SCEVWrapPredicate::IncrementNSSW
: SCEVWrapPredicate::IncrementNUSW;
const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags);
Predicates.push_back(AddRecPred);
// Create the Equal Predicates P2,P3:
auto AppendPredicate = [&](const SCEV *Expr) -> void {
assert(isLoopInvariant(Expr, L) && "Expr is expected to be invariant");
const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy);
const SCEV *ExtendedExpr =
Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType())
: getZeroExtendExpr(TruncatedExpr, Expr->getType());
if (Expr != ExtendedExpr &&
!isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) {
const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr);
DEBUG(dbgs() << "Added Predicate: " << *Pred);
Predicates.push_back(Pred);
}
};
AppendPredicate(StartVal);
AppendPredicate(Accum);
// *** Part3: Predicates are ready. Now go ahead and create the new addrec in
// which the casts had been folded away. The caller can rewrite SymbolicPHI
// into NewAR if it will also add the runtime overflow checks specified in
// Predicates.
auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap);
std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> PredRewrite =
std::make_pair(NewAR, Predicates);
// Remember the result of the analysis for this SCEV at this location.
PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite;
return PredRewrite;
}
Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) {
auto *PN = cast<PHINode>(SymbolicPHI->getValue());
const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
if (!L)
return None;
// Check to see if we already analyzed this PHI.
auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L});
if (I != PredicatedSCEVRewrites.end()) {
std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> Rewrite =
I->second;
// Analysis was done before and failed to create an AddRec:
if (Rewrite.first == SymbolicPHI)
return None;
// Analysis was done before and succeeded in creating an AddRec under
// a predicate:
assert(isa<SCEVAddRecExpr>(Rewrite.first) && "Expected an AddRec");
assert(!(Rewrite.second).empty() && "Expected to find Predicates");
return Rewrite;
}
Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI);
// Record in the cache that the analysis failed
if (!Rewrite) {
SmallVector<const SCEVPredicate *, 3> Predicates;
PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates};
return None;
}
return Rewrite;
}
/// A helper function for createAddRecFromPHI to handle simple cases.
///
/// This function tries to find an AddRec expression for the simplest (yet most
/// common) cases: PN = PHI(Start, OP(Self, LoopInvariant)).
/// If it fails, createAddRecFromPHI will use a more general, but slow,
/// technique for finding the AddRec expression.
const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN,
Value *BEValueV,
Value *StartValueV) {
const Loop *L = LI.getLoopFor(PN->getParent());
assert(L && L->getHeader() == PN->getParent());
assert(BEValueV && StartValueV);
auto BO = MatchBinaryOp(BEValueV, DT);
if (!BO)
return nullptr;
if (BO->Opcode != Instruction::Add)
return nullptr;
const SCEV *Accum = nullptr;
if (BO->LHS == PN && L->isLoopInvariant(BO->RHS))
Accum = getSCEV(BO->RHS);
else if (BO->RHS == PN && L->isLoopInvariant(BO->LHS))
Accum = getSCEV(BO->LHS);
if (!Accum)
return nullptr;
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
if (BO->IsNUW)
Flags = setFlags(Flags, SCEV::FlagNUW);
if (BO->IsNSW)
Flags = setFlags(Flags, SCEV::FlagNSW);
const SCEV *StartVal = getSCEV(StartValueV);
const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
// We can add Flags to the post-inc expression only if we
// know that it is *undefined behavior* for BEValueV to
// overflow.
if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
(void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
return PHISCEV;
}
const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
const Loop *L = LI.getLoopFor(PN->getParent());
if (!L || L->getHeader() != PN->getParent())
return nullptr;
// The loop may have multiple entrances or multiple exits; we can analyze
// this phi as an addrec if it has a unique entry value and a unique
// backedge value.
Value *BEValueV = nullptr, *StartValueV = nullptr;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *V = PN->getIncomingValue(i);
if (L->contains(PN->getIncomingBlock(i))) {
if (!BEValueV) {
BEValueV = V;
} else if (BEValueV != V) {
BEValueV = nullptr;
break;
}
} else if (!StartValueV) {
StartValueV = V;
} else if (StartValueV != V) {
StartValueV = nullptr;
break;
}
}
if (!BEValueV || !StartValueV)
return nullptr;
assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
"PHI node already processed?");
// First, try to find an AddRec expression without creating a fictitious symbolic
// value for PN.
if (auto *S = createSimpleAffineAddRec(PN, BEValueV, StartValueV))
return S;
// Handle PHI node value symbolically.
const SCEV *SymbolicName = getUnknown(PN);
ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName});
// Using this symbolic name for the PHI, analyze the value coming around
// the back-edge.
const SCEV *BEValue = getSCEV(BEValueV);
// NOTE: If BEValue is loop invariant, we know that the PHI node just
// has a special value for the first iteration of the loop.
// If the value coming around the backedge is an add with the symbolic
// value we just inserted, then we found a simple induction variable!
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) {
// If there is a single occurrence of the symbolic value, replace it
// with a recurrence.
unsigned FoundIndex = Add->getNumOperands();
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
if (Add->getOperand(i) == SymbolicName)
if (FoundIndex == e) {
FoundIndex = i;
break;
}
if (FoundIndex != Add->getNumOperands()) {
// Create an add with everything but the specified operand.
SmallVector<const SCEV *, 8> Ops;
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
if (i != FoundIndex)
Ops.push_back(Add->getOperand(i));
const SCEV *Accum = getAddExpr(Ops);
// This is not a valid addrec if the step amount varies with each loop
// iteration but is not itself an addrec in this loop.
if (isLoopInvariant(Accum, L) ||
(isa<SCEVAddRecExpr>(Accum) &&
cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
if (auto BO = MatchBinaryOp(BEValueV, DT)) {
if (BO->Opcode == Instruction::Add && BO->LHS == PN) {
if (BO->IsNUW)
Flags = setFlags(Flags, SCEV::FlagNUW);
if (BO->IsNSW)
Flags = setFlags(Flags, SCEV::FlagNSW);
}
} else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
// If the increment is an inbounds GEP, then we know the address
// space cannot be wrapped around. We cannot make any guarantee
// about signed or unsigned overflow because pointers are
// unsigned but we may have a negative index from the base
// pointer. We can guarantee that no unsigned wrap occurs if the
// indices form a positive value.
if (GEP->isInBounds() && GEP->getOperand(0) == PN) {
Flags = setFlags(Flags, SCEV::FlagNW);
const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
Flags = setFlags(Flags, SCEV::FlagNUW);
}
// We cannot transfer nuw and nsw flags from subtraction
// operations -- sub nuw X, Y is not the same as add nuw X, -Y
// for instance.
}
const SCEV *StartVal = getSCEV(StartValueV);
const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
// Okay, for the entire analysis of this edge we assumed the PHI
// to be symbolic. We now need to go back and purge all of the
// entries for the scalars that use the symbolic expression.
forgetSymbolicName(PN, SymbolicName);
ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
// We can add Flags to the post-inc expression only if we
// know that it is *undefined behavior* for BEValueV to
// overflow.
if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
(void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
return PHISCEV;
}
}
} else {
// Otherwise, this could be a loop like this:
// i = 0; for (j = 1; ..; ++j) { .... i = j; }
// In this case, j = {1,+,1} and BEValue is j.
// Because the other in-value of i (0) fits the evolution of BEValue,
// i really is an addrec evolution.
//
// We can generalize this by saying that i is the shifted value of BEValue
// by one iteration:
// PHI(f(0), f({1,+,1})) --> f({0,+,1})
const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this);
if (Shifted != getCouldNotCompute() &&
Start != getCouldNotCompute()) {
const SCEV *StartVal = getSCEV(StartValueV);
if (Start == StartVal) {
// Okay, for the entire analysis of this edge we assumed the PHI
// to be symbolic. We now need to go back and purge all of the
// entries for the scalars that use the symbolic expression.
forgetSymbolicName(PN, SymbolicName);
ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted;
return Shifted;
}
}
}
// Remove the temporary PHI node SCEV that has been inserted while intending
// to create an AddRecExpr for this PHI node. We cannot keep this temporary
// as it would prevent later (possibly simpler) SCEV expressions from being
// added to the ValueExprMap.
eraseValueFromMap(PN);
return nullptr;
}
// Checks if the SCEV S is available at BB. S is considered available at BB
// if S can be materialized at BB without introducing a fault.
static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S,
BasicBlock *BB) {
struct CheckAvailable {
bool TraversalDone = false;
bool Available = true;
const Loop *L = nullptr; // The loop BB is in (can be nullptr)
BasicBlock *BB = nullptr;
DominatorTree &DT;
CheckAvailable(const Loop *L, BasicBlock *BB, DominatorTree &DT)
: L(L), BB(BB), DT(DT) {}
bool setUnavailable() {
TraversalDone = true;
Available = false;
return false;
}
bool follow(const SCEV *S) {
switch (S->getSCEVType()) {
case scConstant: case scTruncate: case scZeroExtend: case scSignExtend:
case scAddExpr: case scMulExpr: case scUMaxExpr: case scSMaxExpr:
// These expressions are available if their operand(s) is/are.
return true;
case scAddRecExpr: {
// We allow add recurrences that are on the loop that BB is in, or some
// outer loop. This guarantees availability because the value of the
// add recurrence at BB is simply the "current" value of the induction
// variable. We can relax this in the future; for instance an add
// recurrence on a sibling dominating loop is also available at BB.
const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop();
if (L && (ARLoop == L || ARLoop->contains(L)))
return true;
return setUnavailable();
}
case scUnknown: {
// For SCEVUnknown, we check for simple dominance.
const auto *SU = cast<SCEVUnknown>(S);
Value *V = SU->getValue();
if (isa<Argument>(V))
return false;
if (isa<Instruction>(V) && DT.dominates(cast<Instruction>(V), BB))
return false;
return setUnavailable();
}
case scUDivExpr:
case scCouldNotCompute:
// We do not try to be smart about these at all.
return setUnavailable();
}
llvm_unreachable("switch should be fully covered!");
}
bool isDone() { return TraversalDone; }
};
CheckAvailable CA(L, BB, DT);
SCEVTraversal<CheckAvailable> ST(CA);
ST.visitAll(S);
return CA.Available;
}
// Try to match a control flow sequence that branches out at BI and merges back
// at Merge into a "C ? LHS : RHS" select pattern. Return true on a successful
// match.
static bool BrPHIToSelect(DominatorTree &DT, BranchInst *BI, PHINode *Merge,
Value *&C, Value *&LHS, Value *&RHS) {
C = BI->getCondition();
BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0));
BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1));
if (!LeftEdge.isSingleEdge())
return false;
assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()");
Use &LeftUse = Merge->getOperandUse(0);
Use &RightUse = Merge->getOperandUse(1);
if (DT.dominates(LeftEdge, LeftUse) && DT.dominates(RightEdge, RightUse)) {
LHS = LeftUse;
RHS = RightUse;
return true;
}
if (DT.dominates(LeftEdge, RightUse) && DT.dominates(RightEdge, LeftUse)) {
LHS = RightUse;
RHS = LeftUse;
return true;
}
return false;
}
const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) {
auto IsReachable =
[&](BasicBlock *BB) { return DT.isReachableFromEntry(BB); };
if (PN->getNumIncomingValues() == 2 && all_of(PN->blocks(), IsReachable)) {
const Loop *L = LI.getLoopFor(PN->getParent());
// We don't want to break LCSSA, even in a SCEV expression tree.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (LI.getLoopFor(PN->getIncomingBlock(i)) != L)
return nullptr;
// Try to match
//
// br %cond, label %left, label %right
// left:
// br label %merge
// right:
// br label %merge
// merge:
// V = phi [ %x, %left ], [ %y, %right ]
//
// as "select %cond, %x, %y"
BasicBlock *IDom = DT[PN->getParent()]->getIDom()->getBlock();
assert(IDom && "At least the entry block should dominate PN");
auto *BI = dyn_cast<BranchInst>(IDom->getTerminator());
Value *Cond = nullptr, *LHS = nullptr, *RHS = nullptr;
if (BI && BI->isConditional() &&
BrPHIToSelect(DT, BI, PN, Cond, LHS, RHS) &&
IsAvailableOnEntry(L, DT, getSCEV(LHS), PN->getParent()) &&
IsAvailableOnEntry(L, DT, getSCEV(RHS), PN->getParent()))
return createNodeForSelectOrPHI(PN, Cond, LHS, RHS);
}
return nullptr;
}
const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (const SCEV *S = createAddRecFromPHI(PN))
return S;
if (const SCEV *S = createNodeFromSelectLikePHI(PN))
return S;
// If the PHI has a single incoming value, follow that value, unless the
// PHI's incoming blocks are in a different loop, in which case doing so
// risks breaking LCSSA form. Instcombine would normally zap these, but
// it doesn't have DominatorTree information, so it may miss cases.
if (Value *V = SimplifyInstruction(PN, {getDataLayout(), &TLI, &DT, &AC}))
if (LI.replacementPreservesLCSSAForm(PN, V))
return getSCEV(V);
// If it's not a loop phi, we can't handle it yet.
return getUnknown(PN);
}
const SCEV *ScalarEvolution::createNodeForSelectOrPHI(Instruction *I,
Value *Cond,
Value *TrueVal,
Value *FalseVal) {
// Handle "constant" branch or select. This can occur for instance when a
// loop pass transforms an inner loop and moves on to process the outer loop.
if (auto *CI = dyn_cast<ConstantInt>(Cond))
return getSCEV(CI->isOne() ? TrueVal : FalseVal);
// Try to match some simple smax or umax patterns.
auto *ICI = dyn_cast<ICmpInst>(Cond);
if (!ICI)
return getUnknown(I);
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
switch (ICI->getPredicate()) {
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
// a >s b ? a+x : b+x -> smax(a, b)+x
// a >s b ? b+x : a+x -> smin(a, b)+x
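// Both rewrites below require the two select arms to share a common offset x;
// that common offset is detected as LDiff == RDiff.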
if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) {
const SCEV *LS = getNoopOrSignExtend(getSCEV(LHS), I->getType());
const SCEV *RS = getNoopOrSignExtend(getSCEV(RHS), I->getType());
const SCEV *LA = getSCEV(TrueVal);
const SCEV *RA = getSCEV(FalseVal);
const SCEV *LDiff = getMinusSCEV(LA, LS);
const SCEV *RDiff = getMinusSCEV(RA, RS);
if (LDiff == RDiff)
return getAddExpr(getSMaxExpr(LS, RS), LDiff);
LDiff = getMinusSCEV(LA, RS);
RDiff = getMinusSCEV(RA, LS);
if (LDiff == RDiff)
return getAddExpr(getSMinExpr(LS, RS), LDiff);
}
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
// a >u b ? a+x : b+x -> umax(a, b)+x
// a >u b ? b+x : a+x -> umin(a, b)+x
if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType())) {
const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType());
const SCEV *RS = getNoopOrZeroExtend(getSCEV(RHS), I->getType());
const SCEV *LA = getSCEV(TrueVal);
const SCEV *RA = getSCEV(FalseVal);
const SCEV *LDiff = getMinusSCEV(LA, LS);
const SCEV *RDiff = getMinusSCEV(RA, RS);
if (LDiff == RDiff)
return getAddExpr(getUMaxExpr(LS, RS), LDiff);
LDiff = getMinusSCEV(LA, RS);
RDiff = getMinusSCEV(RA, LS);
if (LDiff == RDiff)
return getAddExpr(getUMinExpr(LS, RS), LDiff);
}
break;
case ICmpInst::ICMP_NE:
// n != 0 ? n+x : 1+x -> umax(n, 1)+x
if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) &&
isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) {
const SCEV *One = getOne(I->getType());
const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType());
const SCEV *LA = getSCEV(TrueVal);
const SCEV *RA = getSCEV(FalseVal);
const SCEV *LDiff = getMinusSCEV(LA, LS);
const SCEV *RDiff = getMinusSCEV(RA, One);
if (LDiff == RDiff)
return getAddExpr(getUMaxExpr(One, LS), LDiff);
}
break;
case ICmpInst::ICMP_EQ:
// n == 0 ? 1+x : n+x -> umax(n, 1)+x
if (getTypeSizeInBits(LHS->getType()) <= getTypeSizeInBits(I->getType()) &&
isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()) {
const SCEV *One = getOne(I->getType());
const SCEV *LS = getNoopOrZeroExtend(getSCEV(LHS), I->getType());
const SCEV *LA = getSCEV(TrueVal);
const SCEV *RA = getSCEV(FalseVal);
const SCEV *LDiff = getMinusSCEV(LA, One);
const SCEV *RDiff = getMinusSCEV(RA, LS);
if (LDiff == RDiff)
return getAddExpr(getUMaxExpr(One, LS), LDiff);
}
break;
default:
break;
}
return getUnknown(I);
}
/// Expand GEP instructions into add and multiply operations. This allows them
/// to be analyzed by regular SCEV code.
const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
// Don't attempt to analyze GEPs over unsized objects.
if (!GEP->getSourceElementType()->isSized())
return getUnknown(GEP);
SmallVector<const SCEV *, 4> IndexExprs;
for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
IndexExprs.push_back(getSCEV(*Index));
return getGEPExpr(GEP, IndexExprs);
}
uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return C->getAPInt().countTrailingZeros();
if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(S))
return std::min(GetMinTrailingZeros(T->getOperand()),
(uint32_t)getTypeSizeInBits(T->getType()));
if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
return OpRes == getTypeSizeInBits(E->getOperand()->getType())
? getTypeSizeInBits(E->getType())
: OpRes;
}
if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
return OpRes == getTypeSizeInBits(E->getOperand()->getType())
? getTypeSizeInBits(E->getType())
: OpRes;
}
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
// The result is the min of all operands' results.
uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0));
for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i)));
return MinOpRes;
}
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
// The result is the sum of all operands' results.
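// For example, if one factor has at least 2 trailing zero bits and another
// has at least 1, their product has at least 3 (capped at the bit width).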
uint32_t SumOpRes = GetMinTrailingZeros(M->getOperand(0));
uint32_t BitWidth = getTypeSizeInBits(M->getType());
for (unsigned i = 1, e = M->getNumOperands();
SumOpRes != BitWidth && i != e; ++i)
SumOpRes =
std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)), BitWidth);
return SumOpRes;
}
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
// The result is the min of all operands' results.
uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0));
for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i)));
return MinOpRes;
}
if (const SCEVSMaxExpr *M = dyn_cast<SCEVSMaxExpr>(S)) {
// The result is the min of all operands' results.
uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0));
for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i)));
return MinOpRes;
}
if (const SCEVUMaxExpr *M = dyn_cast<SCEVUMaxExpr>(S)) {
// The result is the min of all operands' results.
uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0));
for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i)));
return MinOpRes;
}
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
// For a SCEVUnknown, ask ValueTracking.
KnownBits Known = computeKnownBits(U->getValue(), getDataLayout(), 0, &AC, nullptr, &DT);
return Known.countMinTrailingZeros();
}
// SCEVUDivExpr
return 0;
}
uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
auto I = MinTrailingZerosCache.find(S);
if (I != MinTrailingZerosCache.end())
return I->second;
uint32_t Result = GetMinTrailingZerosImpl(S);
auto InsertPair = MinTrailingZerosCache.insert({S, Result});
assert(InsertPair.second && "Should insert a new key");
return InsertPair.first->second;
}
/// Helper method to assign a range to V from metadata present in the IR.
static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
if (MDNode *MD = I->getMetadata(LLVMContext::MD_range))
return getConstantRangeFromMetadata(*MD);
return None;
}
/// Determine the range for a particular SCEV. If SignHint is
/// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges
/// with a "cleaner" unsigned (resp. signed) representation.
const ConstantRange &
ScalarEvolution::getRangeRef(const SCEV *S,
ScalarEvolution::RangeSignHint SignHint) {
DenseMap<const SCEV *, ConstantRange> &Cache =
SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges
: SignedRanges;
// See if we've computed this range already.
DenseMap<const SCEV *, ConstantRange>::iterator I = Cache.find(S);
if (I != Cache.end())
return I->second;
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return setRange(C, SignHint, ConstantRange(C->getAPInt()));
unsigned BitWidth = getTypeSizeInBits(S->getType());
ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true);
// If the value has known zeros, the maximum value will have those known zeros
// as well.
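// For example, with TZ == 2 and an 8-bit unsigned hint, the conservative
// range below becomes [0, 253), i.e. the largest admitted value is 0xFC.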
uint32_t TZ = GetMinTrailingZeros(S);
if (TZ != 0) {
if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED)
ConservativeResult =
ConstantRange(APInt::getMinValue(BitWidth),
APInt::getMaxValue(BitWidth).lshr(TZ).shl(TZ) + 1);
else
ConservativeResult = ConstantRange(
APInt::getSignedMinValue(BitWidth),
APInt::getSignedMaxValue(BitWidth).ashr(TZ).shl(TZ) + 1);
}
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
ConstantRange X = getRangeRef(Add->getOperand(0), SignHint);
for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i)
X = X.add(getRangeRef(Add->getOperand(i), SignHint));
return setRange(Add, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
ConstantRange X = getRangeRef(Mul->getOperand(0), SignHint);
for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i)
X = X.multiply(getRangeRef(Mul->getOperand(i), SignHint));
return setRange(Mul, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
ConstantRange X = getRangeRef(SMax->getOperand(0), SignHint);
for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i)
X = X.smax(getRangeRef(SMax->getOperand(i), SignHint));
return setRange(SMax, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
ConstantRange X = getRangeRef(UMax->getOperand(0), SignHint);
for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i)
X = X.umax(getRangeRef(UMax->getOperand(i), SignHint));
return setRange(UMax, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
ConstantRange X = getRangeRef(UDiv->getLHS(), SignHint);
ConstantRange Y = getRangeRef(UDiv->getRHS(), SignHint);
return setRange(UDiv, SignHint,
ConservativeResult.intersectWith(X.udiv(Y)));
}
if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) {
ConstantRange X = getRangeRef(ZExt->getOperand(), SignHint);
return setRange(ZExt, SignHint,
ConservativeResult.intersectWith(X.zeroExtend(BitWidth)));
}
if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) {
ConstantRange X = getRangeRef(SExt->getOperand(), SignHint);
return setRange(SExt, SignHint,
ConservativeResult.intersectWith(X.signExtend(BitWidth)));
}
if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) {
ConstantRange X = getRangeRef(Trunc->getOperand(), SignHint);
return setRange(Trunc, SignHint,
ConservativeResult.intersectWith(X.truncate(BitWidth)));
}
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
// If there's no unsigned wrap, the value will never be less than its
// initial value.
if (AddRec->hasNoUnsignedWrap())
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart()))
if (!C->getValue()->isZero())
ConservativeResult = ConservativeResult.intersectWith(
ConstantRange(C->getAPInt(), APInt(BitWidth, 0)));
// If there's no signed wrap, and all the operands have the same sign or
// zero, the value won't ever change sign.
if (AddRec->hasNoSignedWrap()) {
bool AllNonNeg = true;
bool AllNonPos = true;
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
if (!isKnownNonNegative(AddRec->getOperand(i))) AllNonNeg = false;
if (!isKnownNonPositive(AddRec->getOperand(i))) AllNonPos = false;
}
if (AllNonNeg)
ConservativeResult = ConservativeResult.intersectWith(
ConstantRange(APInt(BitWidth, 0),
APInt::getSignedMinValue(BitWidth)));
else if (AllNonPos)
ConservativeResult = ConservativeResult.intersectWith(
ConstantRange(APInt::getSignedMinValue(BitWidth),
APInt(BitWidth, 1)));
}
// TODO: non-affine addrec
if (AddRec->isAffine()) {
const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
auto RangeFromAffine = getRangeForAffineAR(
AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount,
BitWidth);
if (!RangeFromAffine.isFullSet())
ConservativeResult =
ConservativeResult.intersectWith(RangeFromAffine);
auto RangeFromFactoring = getRangeViaFactoring(
AddRec->getStart(), AddRec->getStepRecurrence(*this), MaxBECount,
BitWidth);
if (!RangeFromFactoring.isFullSet())
ConservativeResult =
ConservativeResult.intersectWith(RangeFromFactoring);
}
}
return setRange(AddRec, SignHint, std::move(ConservativeResult));
}
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
// Check if the IR explicitly contains !range metadata.
Optional<ConstantRange> MDRange = GetRangeFromMetadata(U->getValue());
if (MDRange.hasValue())
ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue());
// Split here to avoid paying the compile-time cost of calling both
// computeKnownBits and ComputeNumSignBits. This restriction can be lifted
// if needed.
const DataLayout &DL = getDataLayout();
if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) {
// For a SCEVUnknown, ask ValueTracking.
KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT);
if (Known.One != ~Known.Zero + 1)
ConservativeResult =
ConservativeResult.intersectWith(ConstantRange(Known.One,
~Known.Zero + 1));
} else {
assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED &&
"generalize as needed!");
unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, &AC, nullptr, &DT);
if (NS > 1)
ConservativeResult = ConservativeResult.intersectWith(
ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1),
APInt::getSignedMaxValue(BitWidth).ashr(NS - 1) + 1));
}
return setRange(U, SignHint, std::move(ConservativeResult));
}
return setRange(S, SignHint, std::move(ConservativeResult));
}
// Given a StartRange, Step and MaxBECount for an expression, compute a range
// of values that the expression can take. Initially, the expression has a
// value from StartRange and is then changed by Step up to MaxBECount times.
// The Signed argument defines whether we treat Step as signed or unsigned.
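// A small worked example (i8, Signed): StartRange = [0, 10), Step = 2,
// MaxBECount = 3. Offset = 6, the upper boundary 9 moves to 15, and the
// resulting range is [0, 16).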
static ConstantRange getRangeForAffineARHelper(APInt Step,
const ConstantRange &StartRange,
const APInt &MaxBECount,
unsigned BitWidth, bool Signed) {
// If either Step or MaxBECount is 0, then the expression won't change, and we
// just need to return the initial range.
if (Step == 0 || MaxBECount == 0)
return StartRange;
// If we don't know anything about the initial value (i.e. StartRange is
// FullRange), then we don't know anything about the final range either.
// Return FullRange.
if (StartRange.isFullSet())
return ConstantRange(BitWidth, /* isFullSet = */ true);
// If Step is signed and negative, then we use its absolute value, but we also
// note that we're moving in the opposite direction.
bool Descending = Signed && Step.isNegative();
if (Signed)
// This is correct even for INT_SMIN. Let's look at i8 to illustrate this:
// abs(INT_SMIN) = abs(-128) = abs(0x80) = -0x80 = 0x80 = 128.
// These equalities hold due to the well-defined wrap-around behavior of
// APInt.
Step = Step.abs();
// Check if Offset (Step * MaxBECount) would exceed the full span of BitWidth.
// If it would, the expression is guaranteed to overflow.
if (APInt::getMaxValue(StartRange.getBitWidth()).udiv(Step).ult(MaxBECount))
return ConstantRange(BitWidth, /* isFullSet = */ true);
// Offset is the total amount by which the expression can change. The checks
// above guarantee that this multiplication does not overflow.
APInt Offset = Step * MaxBECount;
// The minimum value of the final range will match the minimum of StartRange
// if the expression is increasing, and will be decreased by Offset otherwise.
// The maximum value of the final range will match the maximum of StartRange
// if the expression is decreasing, and will be increased by Offset otherwise.
APInt StartLower = StartRange.getLower();
APInt StartUpper = StartRange.getUpper() - 1;
APInt MovedBoundary = Descending ? (StartLower - std::move(Offset))
: (StartUpper + std::move(Offset));
// It's possible that the new minimum/maximum value will fall into the initial
// range (due to wrap around). This means that the expression can take any
// value in this bitwidth, and we have to return full range.
if (StartRange.contains(MovedBoundary))
return ConstantRange(BitWidth, /* isFullSet = */ true);
APInt NewLower =
Descending ? std::move(MovedBoundary) : std::move(StartLower);
APInt NewUpper =
Descending ? std::move(StartUpper) : std::move(MovedBoundary);
NewUpper += 1;
// If we end up with full range, return a proper full range.
if (NewLower == NewUpper)
return ConstantRange(BitWidth, /* isFullSet = */ true);
// No overflow detected; return the computed [NewLower, NewUpper) range.
return ConstantRange(std::move(NewLower), std::move(NewUpper));
}
ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
const SCEV *Step,
const SCEV *MaxBECount,
unsigned BitWidth) {
assert(!isa<SCEVCouldNotCompute>(MaxBECount) &&
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth &&
"Precondition!");
MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType());
APInt MaxBECountValue = getUnsignedRangeMax(MaxBECount);
// First, consider step signed.
ConstantRange StartSRange = getSignedRange(Start);
ConstantRange StepSRange = getSignedRange(Step);
// If Step can be both positive and negative, we need to find ranges for the
// maximum absolute step values in both directions and union them.
ConstantRange SR =
getRangeForAffineARHelper(StepSRange.getSignedMin(), StartSRange,
MaxBECountValue, BitWidth, /* Signed = */ true);
SR = SR.unionWith(getRangeForAffineARHelper(StepSRange.getSignedMax(),
StartSRange, MaxBECountValue,
BitWidth, /* Signed = */ true));
// Next, consider step unsigned.
ConstantRange UR = getRangeForAffineARHelper(
getUnsignedRangeMax(Step), getUnsignedRange(Start),
MaxBECountValue, BitWidth, /* Signed = */ false);
// Finally, intersect signed and unsigned ranges.
return SR.intersectWith(UR);
}
ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start,
const SCEV *Step,
const SCEV *MaxBECount,
unsigned BitWidth) {
// RangeOf({C?A:B,+,C?P:Q}) == RangeOf(C?{A,+,P}:{B,+,Q})
// == RangeOf({A,+,P}) union RangeOf({B,+,Q})
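// For instance, if Start is (C ? 0 : 10) and Step is (C ? 1 : 2), we take
// the union of the ranges of {0,+,1} and {10,+,2}.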
struct SelectPattern {
Value *Condition = nullptr;
APInt TrueValue;
APInt FalseValue;
explicit SelectPattern(ScalarEvolution &SE, unsigned BitWidth,
const SCEV *S) {
Optional<unsigned> CastOp;
APInt Offset(BitWidth, 0);
assert(SE.getTypeSizeInBits(S->getType()) == BitWidth &&
"Should be!");
// Peel off a constant offset:
if (auto *SA = dyn_cast<SCEVAddExpr>(S)) {
// In the future we could consider being smarter here and handle
// {Start+Step,+,Step} too.
if (SA->getNumOperands() != 2 || !isa<SCEVConstant>(SA->getOperand(0)))
return;
Offset = cast<SCEVConstant>(SA->getOperand(0))->getAPInt();
S = SA->getOperand(1);
}
// Peel off a cast operation
if (auto *SCast = dyn_cast<SCEVCastExpr>(S)) {
CastOp = SCast->getSCEVType();
S = SCast->getOperand();
}
using namespace llvm::PatternMatch;
auto *SU = dyn_cast<SCEVUnknown>(S);
const APInt *TrueVal, *FalseVal;
if (!SU ||
!match(SU->getValue(), m_Select(m_Value(Condition), m_APInt(TrueVal),
m_APInt(FalseVal)))) {
Condition = nullptr;
return;
}
TrueValue = *TrueVal;
FalseValue = *FalseVal;
// Re-apply the cast we peeled off earlier
if (CastOp.hasValue())
switch (*CastOp) {
default:
llvm_unreachable("Unknown SCEV cast type!");
case scTruncate:
TrueValue = TrueValue.trunc(BitWidth);
FalseValue = FalseValue.trunc(BitWidth);
break;
case scZeroExtend:
TrueValue = TrueValue.zext(BitWidth);
FalseValue = FalseValue.zext(BitWidth);
break;
case scSignExtend:
TrueValue = TrueValue.sext(BitWidth);
FalseValue = FalseValue.sext(BitWidth);
break;
}
// Re-apply the constant offset we peeled off earlier
TrueValue += Offset;
FalseValue += Offset;
}
bool isRecognized() { return Condition != nullptr; }
};
SelectPattern StartPattern(*this, BitWidth, Start);
if (!StartPattern.isRecognized())
return ConstantRange(BitWidth, /* isFullSet = */ true);
SelectPattern StepPattern(*this, BitWidth, Step);
if (!StepPattern.isRecognized())
return ConstantRange(BitWidth, /* isFullSet = */ true);
if (StartPattern.Condition != StepPattern.Condition) {
// We don't handle this case today; but we could, by considering four
// possibilities below instead of two. I'm not sure if there are cases where
// that will help over what getRange already does, though.
return ConstantRange(BitWidth, /* isFullSet = */ true);
}
// NB! Calling ScalarEvolution::getConstant is fine, but we should not try to
// construct arbitrary general SCEV expressions here. This function is called
// from deep in the call stack, and calling getSCEV (on a sext instruction,
// say) can end up caching a suboptimal value.
// FIXME: without the explicit `this` receiver below, MSVC errors out with
// C2352 and C2512 (otherwise it isn't needed).
const SCEV *TrueStart = this->getConstant(StartPattern.TrueValue);
const SCEV *TrueStep = this->getConstant(StepPattern.TrueValue);
const SCEV *FalseStart = this->getConstant(StartPattern.FalseValue);
const SCEV *FalseStep = this->getConstant(StepPattern.FalseValue);
ConstantRange TrueRange =
this->getRangeForAffineAR(TrueStart, TrueStep, MaxBECount, BitWidth);
ConstantRange FalseRange =
this->getRangeForAffineAR(FalseStart, FalseStep, MaxBECount, BitWidth);
return TrueRange.unionWith(FalseRange);
}
SCEV::NoWrapFlags ScalarEvolution::getNoWrapFlagsFromUB(const Value *V) {
if (isa<ConstantExpr>(V)) return SCEV::FlagAnyWrap;
const BinaryOperator *BinOp = cast<BinaryOperator>(V);
// Return early if there are no flags to propagate to the SCEV.
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
if (BinOp->hasNoUnsignedWrap())
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW);
if (BinOp->hasNoSignedWrap())
Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW);
if (Flags == SCEV::FlagAnyWrap)
return SCEV::FlagAnyWrap;
return isSCEVExprNeverPoison(BinOp) ? Flags : SCEV::FlagAnyWrap;
}
bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
// Here we check that I is in the header of the innermost loop containing I,
// since we only deal with instructions in the loop header. The actual loop we
// need to check later will come from an add recurrence, but getting that
// requires computing the SCEV of the operands, which can be expensive. This
// check we can do cheaply to rule out some cases early.
Loop *InnermostContainingLoop = LI.getLoopFor(I->getParent());
if (InnermostContainingLoop == nullptr ||
InnermostContainingLoop->getHeader() != I->getParent())
return false;
// Only proceed if we can prove that I does not yield poison.
if (!programUndefinedIfFullPoison(I))
return false;
// At this point we know that if I is executed, then it does not wrap
// according to at least one of NSW or NUW. If I is not executed, then we do
// not know if the calculation that I represents would wrap. Multiple
// instructions can map to the same SCEV. If we apply NSW or NUW from I to
// the SCEV, we must guarantee no wrapping for that SCEV also when it is
// derived from other instructions that map to the same SCEV. We cannot make
// that guarantee for cases where I is not executed. So we need to find the
// loop that I is considered in relation to and prove that I is executed for
// every iteration of that loop. That implies that the value that I
// calculates does not wrap anywhere in the loop, so then we can apply the
// flags to the SCEV.
//
// We check isLoopInvariant to disambiguate in case we are adding recurrences
// from different loops, so that we know which loop to prove that I is
// executed in.
for (unsigned OpIndex = 0; OpIndex < I->getNumOperands(); ++OpIndex) {
// I could be an extractvalue from a call to an overflow intrinsic.
// TODO: We can do better here in some cases.
if (!isSCEVable(I->getOperand(OpIndex)->getType()))
return false;
const SCEV *Op = getSCEV(I->getOperand(OpIndex));
if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
bool AllOtherOpsLoopInvariant = true;
for (unsigned OtherOpIndex = 0; OtherOpIndex < I->getNumOperands();
++OtherOpIndex) {
if (OtherOpIndex != OpIndex) {
const SCEV *OtherOp = getSCEV(I->getOperand(OtherOpIndex));
if (!isLoopInvariant(OtherOp, AddRec->getLoop())) {
AllOtherOpsLoopInvariant = false;
break;
}
}
}
if (AllOtherOpsLoopInvariant &&
isGuaranteedToExecuteForEveryIteration(I, AddRec->getLoop()))
return true;
}
}
return false;
}
bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
// If we know that \c I can never be poison period, then that's enough.
if (isSCEVExprNeverPoison(I))
return true;
// For an add recurrence specifically, we assume that infinite loops without
// side effects are undefined behavior, and then reason as follows:
//
// If the add recurrence is poison in any iteration, it is poison on all
// future iterations (since incrementing poison yields poison). If the result
// of the add recurrence is fed into the loop latch condition and the loop
// does not contain any throws or exiting blocks other than the latch, we now
// have the ability to "choose" whether the backedge is taken or not (by
// choosing a sufficiently evil value for the poison feeding into the branch)
// for every iteration including and after the one in which \p I first became
// poison. There are two possibilities (let's call the iteration in which \p
// I first became poison K):
//
// 1. In the set of iterations including and after K, the loop body executes
// no side effects. In this case executing the backedge an infinite number
// of times will yield undefined behavior.
//
// 2. In the set of iterations including and after K, the loop body executes
// at least one side effect. In this case, that specific instance of side
// effect is control dependent on poison, which also yields undefined
// behavior.
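//
// Illustrative sketch (hypothetical loop): in
//   for (i = start; i != n; i += step)
// the incremented value of i feeds the latch compare. If that increment were
// poison, the compare could be "chosen" so the backedge is always taken,
// which by the two cases above is undefined behavior either way; hence the
// increment's nsw/nuw facts may be applied to the add recurrence.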
auto *ExitingBB = L->getExitingBlock();
auto *LatchBB = L->getLoopLatch();
if (!ExitingBB || !LatchBB || ExitingBB != LatchBB)
return false;
SmallPtrSet<const Instruction *, 16> Pushed;
SmallVector<const Instruction *, 8> PoisonStack;
// We start by assuming \c I, the post-inc add recurrence, is poison. Only
// things that are known to be fully poison under that assumption go on the
// PoisonStack.
Pushed.insert(I);
PoisonStack.push_back(I);
bool LatchControlDependentOnPoison = false;
while (!PoisonStack.empty() && !LatchControlDependentOnPoison) {
const Instruction *Poison = PoisonStack.pop_back_val();
for (auto *PoisonUser : Poison->users()) {
if (propagatesFullPoison(cast<Instruction>(PoisonUser))) {
if (Pushed.insert(cast<Instruction>(PoisonUser)).second)
PoisonStack.push_back(cast<Instruction>(PoisonUser));
} else if (auto *BI = dyn_cast<BranchInst>(PoisonUser)) {
assert(BI->isConditional() && "Only possibility!");
if (BI->getParent() == LatchBB) {
LatchControlDependentOnPoison = true;
break;
}
}
}
}
return LatchControlDependentOnPoison && loopHasNoAbnormalExits(L);
}
ScalarEvolution::LoopProperties
ScalarEvolution::getLoopProperties(const Loop *L) {
typedef ScalarEvolution::LoopProperties LoopProperties;
auto Itr = LoopPropertiesCache.find(L);
if (Itr == LoopPropertiesCache.end()) {
auto HasSideEffects = [](Instruction *I) {
if (auto *SI = dyn_cast<StoreInst>(I))
return !SI->isSimple();
return I->mayHaveSideEffects();
};
LoopProperties LP = {/* HasNoAbnormalExits */ true,
/*HasNoSideEffects*/ true};
for (auto *BB : L->getBlocks())
for (auto &I : *BB) {
if (!isGuaranteedToTransferExecutionToSuccessor(&I))
LP.HasNoAbnormalExits = false;
if (HasSideEffects(&I))
LP.HasNoSideEffects = false;
if (!LP.HasNoAbnormalExits && !LP.HasNoSideEffects)
break; // We're already as pessimistic as we can get.
}
auto InsertPair = LoopPropertiesCache.insert({L, LP});
assert(InsertPair.second && "We just checked!");
Itr = InsertPair.first;
}
return Itr->second;
}
const SCEV *ScalarEvolution::createSCEV(Value *V) {
if (!isSCEVable(V->getType()))
return getUnknown(V);
if (Instruction *I = dyn_cast<Instruction>(V)) {
// Don't attempt to analyze instructions in blocks that aren't
// reachable. Such instructions don't matter, and they aren't required
// to obey basic rules for definitions dominating uses which this
// analysis depends on.
if (!DT.isReachableFromEntry(I->getParent()))
return getUnknown(V);
} else if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
return getConstant(CI);
else if (isa<ConstantPointerNull>(V))
return getZero(V->getType());
else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
return GA->isInterposable() ? getUnknown(V) : getSCEV(GA->getAliasee());
else if (!isa<ConstantExpr>(V))
return getUnknown(V);
Operator *U = cast<Operator>(V);
if (auto BO = MatchBinaryOp(U, DT)) {
switch (BO->Opcode) {
case Instruction::Add: {
// The simple thing to do would be to just call getSCEV on both operands
// and call getAddExpr with the result. However if we're looking at a
// bunch of things all added together, this can be quite inefficient,
// because it leads to N-1 getAddExpr calls for N ultimate operands.
// Instead, gather up all the operands and make a single getAddExpr call.
// LLVM IR canonical form means we need only traverse the left operands.
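// Illustrative example (hypothetical values a, b, c, d): in the simple case
// (no cached SCEV and no no-wrap flags), (((a + b) + c) + d) is gathered
// into AddOps as {d, c, b, a} and folded with a single getAddExpr call
// instead of three nested ones.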
SmallVector<const SCEV *, 4> AddOps;
do {
if (BO->Op) {
if (auto *OpSCEV = getExistingSCEV(BO->Op)) {
AddOps.push_back(OpSCEV);
break;
}
// If a NUW or NSW flag can be applied to the SCEV for this
// addition, then compute the SCEV for this addition by itself
// with a separate call to getAddExpr. We need to do that
// instead of pushing the operands of the addition onto AddOps,
// since the flags are only known to apply to this particular
// addition - they may not apply to other additions that can be
// formed with operands from AddOps.
const SCEV *RHS = getSCEV(BO->RHS);
SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op);
if (Flags != SCEV::FlagAnyWrap) {
const SCEV *LHS = getSCEV(BO->LHS);
if (BO->Opcode == Instruction::Sub)
AddOps.push_back(getMinusSCEV(LHS, RHS, Flags));
else
AddOps.push_back(getAddExpr(LHS, RHS, Flags));
break;
}
}
if (BO->Opcode == Instruction::Sub)
AddOps.push_back(getNegativeSCEV(getSCEV(BO->RHS)));
else
AddOps.push_back(getSCEV(BO->RHS));
auto NewBO = MatchBinaryOp(BO->LHS, DT);
if (!NewBO || (NewBO->Opcode != Instruction::Add &&
NewBO->Opcode != Instruction::Sub)) {
AddOps.push_back(getSCEV(BO->LHS));
break;
}
BO = NewBO;
} while (true);
return getAddExpr(AddOps);
}
case Instruction::Mul: {
SmallVector<const SCEV *, 4> MulOps;
do {
if (BO->Op) {
if (auto *OpSCEV = getExistingSCEV(BO->Op)) {
MulOps.push_back(OpSCEV);
break;
}
SCEV::NoWrapFlags Flags = getNoWrapFlagsFromUB(BO->Op);
if (Flags != SCEV::FlagAnyWrap) {
MulOps.push_back(
getMulExpr(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags));
break;
}
}
MulOps.push_back(getSCEV(BO->RHS));
auto NewBO = MatchBinaryOp(BO->LHS, DT);
if (!NewBO || NewBO->Opcode != Instruction::Mul) {
MulOps.push_back(getSCEV(BO->LHS));
break;
}
BO = NewBO;
} while (true);
return getMulExpr(MulOps);
}
case Instruction::UDiv:
return getUDivExpr(getSCEV(BO->LHS), getSCEV(BO->RHS));
case Instruction::Sub: {
SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
if (BO->Op)
Flags = getNoWrapFlagsFromUB(BO->Op);
return getMinusSCEV(getSCEV(BO->LHS), getSCEV(BO->RHS), Flags);
}
case Instruction::And:
// For an expression like x&255 that merely masks off the high bits,
// use zext(trunc(x)) as the SCEV expression.
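// Illustrative example (hypothetical %x): (and i32 %x, 255) is modeled as
// (zext i8 (trunc i32 %x to i8) to i32).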
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
if (CI->isZero())
return getSCEV(BO->RHS);
if (CI->isMinusOne())
return getSCEV(BO->LHS);
const APInt &A = CI->getValue();
// Instcombine's ShrinkDemandedConstant may strip bits out of
// constants, obscuring what would otherwise be a low-bits mask.
// Use computeKnownBits to compute what ShrinkDemandedConstant
// knew about to reconstruct a low-bits mask value.
unsigned LZ = A.countLeadingZeros();
unsigned TZ = A.countTrailingZeros();
unsigned BitWidth = A.getBitWidth();
KnownBits Known(BitWidth);
computeKnownBits(BO->LHS, Known, getDataLayout(),
0, &AC, nullptr, &DT);
APInt EffectiveMask =
APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
if ((LZ != 0 || TZ != 0) && !((~A & ~Known.Zero) & EffectiveMask)) {
const SCEV *MulCount = getConstant(APInt::getOneBitSet(BitWidth, TZ));
const SCEV *LHS = getSCEV(BO->LHS);
const SCEV *ShiftedLHS = nullptr;
if (auto *LHSMul = dyn_cast<SCEVMulExpr>(LHS)) {
if (auto *OpC = dyn_cast<SCEVConstant>(LHSMul->getOperand(0))) {
// For an expression like (x * 8) & 8, simplify the multiply.
unsigned MulZeros = OpC->getAPInt().countTrailingZeros();
unsigned GCD = std::min(MulZeros, TZ);
APInt DivAmt = APInt::getOneBitSet(BitWidth, TZ - GCD);
SmallVector<const SCEV*, 4> MulOps;
MulOps.push_back(getConstant(OpC->getAPInt().lshr(GCD)));
MulOps.append(LHSMul->op_begin() + 1, LHSMul->op_end());
auto *NewMul = getMulExpr(MulOps, LHSMul->getNoWrapFlags());
ShiftedLHS = getUDivExpr(NewMul, getConstant(DivAmt));
}
}
if (!ShiftedLHS)
ShiftedLHS = getUDivExpr(LHS, MulCount);
return getMulExpr(
getZeroExtendExpr(
getTruncateExpr(ShiftedLHS,
IntegerType::get(getContext(), BitWidth - LZ - TZ)),
BO->LHS->getType()),
MulCount);
}
}
break;
case Instruction::Or:
// If the RHS of the Or is a constant, we may have something like:
// X*4+1 which got turned into X*4|1. Handle this as an Add so loop
// optimizations will transparently handle this case.
//
// In order for this transformation to be safe, the LHS must be of the
// form X*(2^n) and the Or constant must be less than 2^n.
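// Illustrative example (hypothetical %x): with %t = mul i32 %x, 4 and
// %r = or i32 %t, 1, the product has at least two known trailing zero bits
// and 1 < 4, so %r is modeled as the add (1 + (4 * %x)).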
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
const SCEV *LHS = getSCEV(BO->LHS);
const APInt &CIVal = CI->getValue();
if (GetMinTrailingZeros(LHS) >=
(CIVal.getBitWidth() - CIVal.countLeadingZeros())) {
// Build a plain add SCEV.
const SCEV *S = getAddExpr(LHS, getSCEV(CI));
// If the LHS of the add was an addrec and it has no-wrap flags,
// transfer the no-wrap flags, since an or won't introduce a wrap.
if (const SCEVAddRecExpr *NewAR = dyn_cast<SCEVAddRecExpr>(S)) {
const SCEVAddRecExpr *OldAR = cast<SCEVAddRecExpr>(LHS);
const_cast<SCEVAddRecExpr *>(NewAR)->setNoWrapFlags(
OldAR->getNoWrapFlags());
}
return S;
}
}
break;
case Instruction::Xor:
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
// If the RHS of xor is -1, then this is a not operation.
if (CI->isMinusOne())
return getNotSCEV(getSCEV(BO->LHS));
// Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
// This is a variant of the check for xor with -1, and it handles
// the case where instcombine has trimmed non-demanded bits out
// of an xor with -1.
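// Illustrative example (hypothetical %x): xor (and i32 %x, 255), 255 keeps
// only the low 8 bits of ~%x, so it is modeled as the zero extension of the
// complemented truncation, (zext i8 (~(trunc i32 %x to i8)) to i32).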
if (auto *LBO = dyn_cast<BinaryOperator>(BO->LHS))
if (ConstantInt *LCI = dyn_cast<ConstantInt>(LBO->getOperand(1)))
if (LBO->getOpcode() == Instruction::And &&
LCI->getValue() == CI->getValue())
if (const SCEVZeroExtendExpr *Z =
dyn_cast<SCEVZeroExtendExpr>(getSCEV(BO->LHS))) {
Type *UTy = BO->LHS->getType();
const SCEV *Z0 = Z->getOperand();
Type *Z0Ty = Z0->getType();
unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
// If C is a low-bits mask, the zero extend is serving to
// mask off the high bits. Complement the operand and
// re-apply the zext.
if (CI->getValue().isMask(Z0TySize))
return getZeroExtendExpr(getNotSCEV(Z0), UTy);
// If C is a single bit, it may be in the sign-bit position
// before the zero-extend. In this case, represent the xor
// using an add, which is equivalent, and re-apply the zext.
APInt Trunc = CI->getValue().trunc(Z0TySize);
if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() &&
Trunc.isSignMask())
return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)),
UTy);
}
}
break;
case Instruction::Shl:
// Turn shift left of a constant amount into a multiply.
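// Illustrative example (hypothetical %x): shl i32 %x, 3 becomes (8 * %x),
// with nuw/nsw transferred when the shift amount permits it (see below).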
if (ConstantInt *SA = dyn_cast<ConstantInt>(BO->RHS)) {
uint32_t BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
// If the shift count is not less than the bitwidth, the result of
// the shift is undefined. Don't try to analyze it, because the
// resolution chosen here may differ from the resolution chosen in
// other parts of the compiler.
if (SA->getValue().uge(BitWidth))
break;
// It is currently not resolved how to interpret NSW for left
// shift by BitWidth - 1, so we avoid applying flags in that
// case. Remove this check (or this comment) once the situation
// is resolved. See
// http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html
// and http://reviews.llvm.org/D8890 .
auto Flags = SCEV::FlagAnyWrap;
if (BO->Op && SA->getValue().ult(BitWidth - 1))
Flags = getNoWrapFlagsFromUB(BO->Op);
Constant *X = ConstantInt::get(getContext(),
APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
return getMulExpr(getSCEV(BO->LHS), getSCEV(X), Flags);
}
break;
case Instruction::AShr:
// AShr X, C, where C is a constant.
ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS);
if (!CI)
break;
Type *OuterTy = BO->LHS->getType();
uint64_t BitWidth = getTypeSizeInBits(OuterTy);
// If the shift count is not less than the bitwidth, the result of
// the shift is undefined. Don't try to analyze it, because the
// resolution chosen here may differ from the resolution chosen in
// other parts of the compiler.
if (CI->getValue().uge(BitWidth))
break;
if (CI->isZero())
return getSCEV(BO->LHS); // shift by zero --> noop
uint64_t AShrAmt = CI->getZExtValue();
Type *TruncTy = IntegerType::get(getContext(), BitWidth - AShrAmt);
Operator *L = dyn_cast<Operator>(BO->LHS);
if (L && L->getOpcode() == Instruction::Shl) {
// X = Shl A, n
// Y = AShr X, m
// Both n and m are constant.
const SCEV *ShlOp0SCEV = getSCEV(L->getOperand(0));
if (L->getOperand(1) == BO->RHS)
// For a two-shift sext-inreg, i.e. n = m,
// use sext(trunc(x)) as the SCEV expression.
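// Illustrative example (hypothetical %x, i32, n = m = 24):
// (ashr (shl %x, 24), 24) is modeled as
// (sext i8 (trunc i32 %x to i8) to i32).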
return getSignExtendExpr(
getTruncateExpr(ShlOp0SCEV, TruncTy), OuterTy);
ConstantInt *ShlAmtCI = dyn_cast<ConstantInt>(L->getOperand(1));
if (ShlAmtCI && ShlAmtCI->getValue().ult(BitWidth)) {
uint64_t ShlAmt = ShlAmtCI->getZExtValue();
if (ShlAmt > AShrAmt) {
// When n > m, use sext(mul(trunc(x), 2^(n-m))) as the SCEV
// expression. We already checked that ShlAmt < BitWidth, so
// the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy since
// ShlAmt - AShrAmt < BitWidth - AShrAmt.
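// Illustrative example (hypothetical %x, i32, n = 6, m = 3):
// (ashr (shl %x, 6), 3) is modeled as
// (sext i29 (8 * (trunc i32 %x to i29)) to i32).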
APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt,
ShlAmt - AShrAmt);
return getSignExtendExpr(
getMulExpr(getTruncateExpr(ShlOp0SCEV, TruncTy),
getConstant(Mul)), OuterTy);
}
}
}
break;
}
}
switch (U->getOpcode()) {
case Instruction::Trunc:
return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
case Instruction::ZExt:
return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType());
case Instruction::SExt:
return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType());
case Instruction::BitCast:
// BitCasts are no-op casts so we just eliminate the cast.
if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType()))
return getSCEV(U->getOperand(0));
break;
// It's tempting to handle inttoptr and ptrtoint as no-ops, however this can
// lead to pointer expressions which cannot safely be expanded to GEPs,
// because ScalarEvolution doesn't respect the GEP aliasing rules when
// simplifying integer expressions.
case Instruction::GetElementPtr:
return createNodeForGEP(cast<GEPOperator>(U));
case Instruction::PHI:
return createNodeForPHI(cast<PHINode>(U));
case Instruction::Select:
// U can also be a select constant expr, which we let fall through. Since
// createNodeForSelect only works for a condition that is an `ICmpInst`, and
// constant expressions cannot have instructions as operands, we'd have
// returned getUnknown for a select constant expression anyway.
if (isa<Instruction>(U))
return createNodeForSelectOrPHI(cast<Instruction>(U), U->getOperand(0),
U->getOperand(1), U->getOperand(2));
break;
case Instruction::Call:
case Instruction::Invoke:
if (Value *RV = CallSite(U).getReturnedArgOperand())
return getSCEV(RV);
break;
}
return getUnknown(V);
}
//===----------------------------------------------------------------------===//
// Iteration Count Computation Code
//
static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
if (!ExitCount)
return 0;
ConstantInt *ExitConst = ExitCount->getValue();
// Guard against huge trip counts.
if (ExitConst->getValue().getActiveBits() > 32)
return 0;
// In case of integer overflow, this returns 0, which is correct.
return ((unsigned)ExitConst->getZExtValue()) + 1;
}
unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripCount(L, ExitingBB);
// No trip count information for multiple exits.
return 0;
}
unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
"Exiting block must actually branch out of the loop!");
const SCEVConstant *ExitCount =
dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock));
return getConstantTripCount(ExitCount);
}
unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
const auto *MaxExitCount =
dyn_cast<SCEVConstant>(getMaxBackedgeTakenCount(L));
return getConstantTripCount(MaxExitCount);
}
unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripMultiple(L, ExitingBB);
// No trip multiple information for multiple exits.
return 0;
}
/// Returns the largest constant divisor of the trip count of this loop as a
/// normal unsigned value, if possible. This means that the actual trip count is
/// always a multiple of the returned value (don't forget the trip count could
/// very well be zero as well!).
///
/// Returns 1 if the trip count is unknown or not guaranteed to be a
/// multiple of a constant (which is also the case if the trip count is simply
/// constant; use getSmallConstantTripCount for that case). Will also return 1
/// if the trip count is very large (>= 2^32).
///
/// As explained in the comments for getSmallConstantTripCount, this assumes
/// that control exits the loop via ExitingBlock.
unsigned
ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
"Exiting block must actually branch out of the loop!");
const SCEV *ExitCount = getExitCount(L, ExitingBlock);
if (ExitCount == getCouldNotCompute())
return 1;
// Get the trip count from the BE count by adding 1.
const SCEV *TCExpr = getAddExpr(ExitCount, getOne(ExitCount->getType()));
const SCEVConstant *TC = dyn_cast<SCEVConstant>(TCExpr);
if (!TC)
// Attempt to factor more general cases. Returns the greatest power of
// two divisor. If overflow happens, the trip count expression is still
// divisible by the greatest power of 2 divisor returned.
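// Illustrative example (hypothetical %n): if the trip count expression is
// (8 * %n) and nothing more is known about %n, GetMinTrailingZeros reports
// 3 trailing zero bits, so the reported multiple is 8.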
return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
ConstantInt *Result = TC->getValue();
// Guard against huge trip counts (this requires checking
// for zero to handle the case where the trip count == -1 and the
// addition wraps).
if (!Result || Result->getValue().getActiveBits() > 32 ||
Result->getValue().getActiveBits() == 0)
return 1;
return (unsigned)Result->getZExtValue();
}
/// Get the expression for the number of loop iterations for which this loop is
/// guaranteed not to exit via ExitingBlock. Otherwise return
/// SCEVCouldNotCompute.
const SCEV *ScalarEvolution::getExitCount(const Loop *L,
BasicBlock *ExitingBlock) {
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
}
const SCEV *
ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L,
SCEVUnionPredicate &Preds) {
return getPredicatedBackedgeTakenInfo(L).getExact(this, &Preds);
}
const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).getExact(this);
}
/// Similar to getBackedgeTakenCount, except return the least SCEV value that is
/// known never to be less than the actual backedge taken count.
const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
return getBackedgeTakenInfo(L).getMax(this);
}
bool ScalarEvolution::isBackedgeTakenCountMaxOrZero(const Loop *L) {
return getBackedgeTakenInfo(L).isMaxOrZero(this);
}
/// Push PHI nodes in the header of the given loop onto the given Worklist.
static void
PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
BasicBlock *Header = L->getHeader();
// Push all Loop-header PHIs onto the Worklist stack.
for (BasicBlock::iterator I = Header->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I)
Worklist.push_back(PN);
}
const ScalarEvolution::BackedgeTakenInfo &
ScalarEvolution::getPredicatedBackedgeTakenInfo(const Loop *L) {
auto &BTI = getBackedgeTakenInfo(L);
if (BTI.hasFullInfo())
return BTI;
auto Pair = PredicatedBackedgeTakenCounts.insert({L, BackedgeTakenInfo()});
if (!Pair.second)
return Pair.first->second;
BackedgeTakenInfo Result =
computeBackedgeTakenCount(L, /*AllowPredicates=*/true);
return PredicatedBackedgeTakenCounts.find(L)->second = std::move(Result);
}
const ScalarEvolution::BackedgeTakenInfo &
ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// Initially insert an invalid entry for this loop. If the insertion
// succeeds, proceed to actually compute a backedge-taken count and
// update the value. The temporary CouldNotCompute value tells SCEV
// code elsewhere that it shouldn't attempt to request a new
// backedge-taken count, which could result in infinite recursion.
std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
BackedgeTakenCounts.insert({L, BackedgeTakenInfo()});
if (!Pair.second)
return Pair.first->second;
// computeBackedgeTakenCount may allocate memory for its result. Inserting it
// into the BackedgeTakenCounts map transfers ownership. Otherwise, the result
// must be cleared in this scope.
BackedgeTakenInfo Result = computeBackedgeTakenCount(L);
if (Result.getExact(this) != getCouldNotCompute()) {
assert(isLoopInvariant(Result.getExact(this), L) &&
isLoopInvariant(Result.getMax(this), L) &&
"Computed backedge-taken count isn't loop invariant for loop!");
++NumTripCountsComputed;
}
else if (Result.getMax(this) == getCouldNotCompute() &&
isa<PHINode>(L->getHeader()->begin())) {
// Only count loops that have phi nodes as not being computable.
++NumTripCountsNotComputed;
}
// Now that we know more about the trip count for this loop, forget any
// existing SCEV values for PHI nodes in this loop since they are only
// conservative estimates made without the benefit of trip count
// information. This is similar to the code in forgetLoop, except that
// it handles SCEVUnknown PHI nodes specially.
if (Result.hasAnyInfo()) {
SmallVector<Instruction *, 16> Worklist;
PushLoopPHIs(L, Worklist);
SmallPtrSet<Instruction *, 8> Visited;
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (!Visited.insert(I).second)
continue;
ValueExprMapType::iterator It =
ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
const SCEV *Old = It->second;
// SCEVUnknown for a PHI either means that it has an unrecognized
// structure, or it's a PHI that's in the process of being computed
// by createNodeForPHI. In the former case, additional loop trip
// count information isn't going to change anything. In the latter
// case, createNodeForPHI will perform the necessary updates on its
// own when it gets to that point.
if (!isa<PHINode>(I) || !isa<SCEVUnknown>(Old)) {
eraseValueFromMap(It->first);
forgetMemoizedResults(Old);
}
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
}
PushDefUseChildren(I, Worklist);
}
}
// Re-lookup the insert position, since the call to
// computeBackedgeTakenCount above could result in a
// recursive call to getBackedgeTakenInfo (on a different
// loop), which would invalidate the iterator computed
// earlier.
return BackedgeTakenCounts.find(L)->second = std::move(Result);
}
void ScalarEvolution::forgetLoop(const Loop *L) {
// Drop any stored trip count value.
auto RemoveLoopFromBackedgeMap =
[L](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
auto BTCPos = Map.find(L);
if (BTCPos != Map.end()) {
BTCPos->second.clear();
Map.erase(BTCPos);
}
};
RemoveLoopFromBackedgeMap(BackedgeTakenCounts);
RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts);
// Drop information about predicated SCEV rewrites for this loop.
for (auto I = PredicatedSCEVRewrites.begin();
I != PredicatedSCEVRewrites.end();) {
std::pair<const SCEV *, const Loop *> Entry = I->first;
if (Entry.second == L)
PredicatedSCEVRewrites.erase(I++);
else
++I;
}
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
PushLoopPHIs(L, Worklist);
SmallPtrSet<Instruction *, 8> Visited;
while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
if (!Visited.insert(I).second)
continue;
ValueExprMapType::iterator It =
ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
eraseValueFromMap(It->first);
forgetMemoizedResults(It->second);
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
}
PushDefUseChildren(I, Worklist);
}
// Forget all contained loops too, to avoid dangling entries in the
// ValuesAtScopes map.
for (Loop *I : *L)
forgetLoop(I);
LoopPropertiesCache.erase(L);
}
void ScalarEvolution::forgetValue(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return;
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
Worklist.push_back(I);
SmallPtrSet<Instruction *, 8> Visited;
while (!Worklist.empty()) {
I = Worklist.pop_back_val();
if (!Visited.insert(I).second)
continue;
ValueExprMapType::iterator It =
ValueExprMap.find_as(static_cast<Value *>(I));
if (It != ValueExprMap.end()) {
eraseValueFromMap(It->first);
forgetMemoizedResults(It->second);
if (PHINode *PN = dyn_cast<PHINode>(I))
ConstantEvolutionLoopExitValue.erase(PN);
}
PushDefUseChildren(I, Worklist);
}
}
/// Get the exact loop backedge taken count considering all loop exits. A
/// computable result can only be returned for loops with a single exit.
/// Returning the minimum taken count among all exits is incorrect because one
/// of the loop's exit limits may have been skipped. howFarToZero assumes that
/// the limit of each loop test is never skipped. This is a valid assumption as
/// long as the loop exits via that test. For precise results, it is the
/// caller's responsibility to specify the relevant loop exit using
/// getExact(ExitingBlock, SE).
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE,
SCEVUnionPredicate *Preds) const {
// If any exits were not computable, the loop is not computable.
if (!isComplete() || ExitNotTaken.empty())
return SE->getCouldNotCompute();
const SCEV *BECount = nullptr;
for (auto &ENT : ExitNotTaken) {
assert(ENT.ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
if (!BECount)
BECount = ENT.ExactNotTaken;
else if (BECount != ENT.ExactNotTaken)
return SE->getCouldNotCompute();
if (Preds && !ENT.hasAlwaysTruePredicate())
Preds->add(ENT.Predicate.get());
assert((Preds || ENT.hasAlwaysTruePredicate()) &&
"Predicate should be always true!");
}
assert(BECount && "Invalid not taken count for loop exit");
return BECount;
}
/// Get the exact not taken count for this loop exit.
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock,
ScalarEvolution *SE) const {
for (auto &ENT : ExitNotTaken)
if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate())
return ENT.ExactNotTaken;
return SE->getCouldNotCompute();
}
/// getMax - Get the max backedge taken count for the loop.
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) {
return !ENT.hasAlwaysTruePredicate();
};
if (any_of(ExitNotTaken, PredicateNotAlwaysTrue) || !getMax())
return SE->getCouldNotCompute();
assert((isa<SCEVCouldNotCompute>(getMax()) || isa<SCEVConstant>(getMax())) &&
"No point in having a non-constant max backedge taken count!");
return getMax();
}
bool ScalarEvolution::BackedgeTakenInfo::isMaxOrZero(ScalarEvolution *SE) const {
auto PredicateNotAlwaysTrue = [](const ExitNotTakenInfo &ENT) {
return !ENT.hasAlwaysTruePredicate();
};
return MaxOrZero && !any_of(ExitNotTaken, PredicateNotAlwaysTrue);
}
bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
ScalarEvolution *SE) const {
if (getMax() && getMax() != SE->getCouldNotCompute() &&
SE->hasOperand(getMax(), S))
return true;
for (auto &ENT : ExitNotTaken)
if (ENT.ExactNotTaken != SE->getCouldNotCompute() &&
SE->hasOperand(ENT.ExactNotTaken, S))
return true;
return false;
}
ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
: ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {
assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
isa<SCEVConstant>(MaxNotTaken)) &&
"No point in having a non-constant max backedge taken count!");
}
ScalarEvolution::ExitLimit::ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
: ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
!isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
"Exact is not allowed to be less precise than Max");
assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
isa<SCEVConstant>(MaxNotTaken)) &&
"No point in having a non-constant max backedge taken count!");
for (auto *PredSet : PredSetList)
for (auto *P : *PredSet)
addPredicate(P);
}
ScalarEvolution::ExitLimit::ExitLimit(
const SCEV *E, const SCEV *M, bool MaxOrZero,
const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
: ExitLimit(E, M, MaxOrZero, {&PredSet}) {
assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
isa<SCEVConstant>(MaxNotTaken)) &&
"No point in having a non-constant max backedge taken count!");
}
ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
bool MaxOrZero)
: ExitLimit(E, M, MaxOrZero, None) {
assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
isa<SCEVConstant>(MaxNotTaken)) &&
"No point in having a non-constant max backedge taken count!");
}
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
SmallVectorImpl<ScalarEvolution::BackedgeTakenInfo::EdgeExitInfo>
&&ExitCounts,
bool Complete, const SCEV *MaxCount, bool MaxOrZero)
: MaxAndComplete(MaxCount, Complete), MaxOrZero(MaxOrZero) {
typedef ScalarEvolution::BackedgeTakenInfo::EdgeExitInfo EdgeExitInfo;
ExitNotTaken.reserve(ExitCounts.size());
std::transform(
ExitCounts.begin(), ExitCounts.end(), std::back_inserter(ExitNotTaken),
[&](const EdgeExitInfo &EEI) {
BasicBlock *ExitBB = EEI.first;
const ExitLimit &EL = EEI.second;
if (EL.Predicates.empty())
return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, nullptr);
std::unique_ptr<SCEVUnionPredicate> Predicate(new SCEVUnionPredicate);
for (auto *Pred : EL.Predicates)
Predicate->add(Pred);
return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, std::move(Predicate));
});
assert((isa<SCEVCouldNotCompute>(MaxCount) || isa<SCEVConstant>(MaxCount)) &&
"No point in having a non-constant max backedge taken count!");
}
/// Invalidate this result and free the ExitNotTakenInfo array.
void ScalarEvolution::BackedgeTakenInfo::clear() {
ExitNotTaken.clear();
}
/// Compute the number of times the backedge of the specified loop will execute.
ScalarEvolution::BackedgeTakenInfo
ScalarEvolution::computeBackedgeTakenCount(const Loop *L,
bool AllowPredicates) {
SmallVector<BasicBlock *, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
typedef ScalarEvolution::BackedgeTakenInfo::EdgeExitInfo EdgeExitInfo;
SmallVector<EdgeExitInfo, 4> ExitCounts;
bool CouldComputeBECount = true;
BasicBlock *Latch = L->getLoopLatch(); // may be NULL.
const SCEV *MustExitMaxBECount = nullptr;
const SCEV *MayExitMaxBECount = nullptr;
bool MustExitMaxOrZero = false;
// Compute the ExitLimit for each loop exit. Use this to populate ExitCounts
// and compute maxBECount.
// Do a union of all the predicates here.
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
BasicBlock *ExitBB = ExitingBlocks[i];
ExitLimit EL = computeExitLimit(L, ExitBB, AllowPredicates);
assert((AllowPredicates || EL.Predicates.empty()) &&
"Predicated exit limit when predicates are not allowed!");
// 1. For each exit that can be computed, add an entry to ExitCounts.
// CouldComputeBECount is true only if all exits can be computed.
if (EL.ExactNotTaken == getCouldNotCompute())
// We couldn't compute an exact value for this exit, so
// we won't be able to compute an exact value for the loop.
CouldComputeBECount = false;
else
ExitCounts.emplace_back(ExitBB, EL);
// 2. Derive the loop's MaxBECount from each exit's max number of
// non-exiting iterations. Partition the loop exits into two kinds:
// LoopMustExits and LoopMayExits.
//
// If the exit dominates the loop latch, it is a LoopMustExit otherwise it
// is a LoopMayExit. If any computable LoopMustExit is found, then
// MaxBECount is the minimum EL.MaxNotTaken of computable
// LoopMustExits. Otherwise, MaxBECount is conservatively the maximum
// EL.MaxNotTaken, where CouldNotCompute is considered greater than any
// computable EL.MaxNotTaken.
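// Illustrative example (hypothetical counts): given two computable exits
// with MaxNotTaken values 10 and 100, MaxBECount is umin(10, 100) = 10 if
// both exits dominate the latch, but the conservative umax(10, 100) = 100
// if neither does.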
if (EL.MaxNotTaken != getCouldNotCompute() && Latch &&
DT.dominates(ExitBB, Latch)) {
if (!MustExitMaxBECount) {
MustExitMaxBECount = EL.MaxNotTaken;
MustExitMaxOrZero = EL.MaxOrZero;
} else {
MustExitMaxBECount =
getUMinFromMismatchedTypes(MustExitMaxBECount, EL.MaxNotTaken);
}
} else if (MayExitMaxBECount != getCouldNotCompute()) {
if (!MayExitMaxBECount || EL.MaxNotTaken == getCouldNotCompute())
MayExitMaxBECount = EL.MaxNotTaken;
else {
MayExitMaxBECount =
getUMaxFromMismatchedTypes(MayExitMaxBECount, EL.MaxNotTaken);
}
}
}
const SCEV *MaxBECount = MustExitMaxBECount ? MustExitMaxBECount :
(MayExitMaxBECount ? MayExitMaxBECount : getCouldNotCompute());
// The loop backedge will be taken the maximum or zero times if there's
// a single exit that must be taken the maximum or zero times.
bool MaxOrZero = (MustExitMaxOrZero && ExitingBlocks.size() == 1);
return BackedgeTakenInfo(std::move(ExitCounts), CouldComputeBECount,
MaxBECount, MaxOrZero);
}
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
bool AllowPredicates) {
// Okay, we've chosen an exiting block. See what condition causes us to exit
// at this block and remember the exit block and whether all other targets
// lead to the loop header.
bool MustExecuteLoopHeader = true;
BasicBlock *Exit = nullptr;
for (auto *SBB : successors(ExitingBlock))
if (!L->contains(SBB)) {
if (Exit) // Multiple exit successors.
return getCouldNotCompute();
Exit = SBB;
} else if (SBB != L->getHeader()) {
MustExecuteLoopHeader = false;
}
// At this point, we know we have a conditional branch that determines whether
// the loop is exited. However, we don't know if the branch is executed each
// time through the loop. If not, then the execution count of the branch will
// not be equal to the trip count of the loop.
//
// Currently we check for this by seeing whether the Exit branch goes to
// the loop header. If so, we know it will always execute the same number of
// times as the loop. We also handle the case where the exit block *is* the
// loop header. This is common for un-rotated loops.
//
// If both of those tests fail, walk up the unique predecessor chain to the
// header, stopping if there is an edge that doesn't exit the loop. If the
// header is reached, the execution count of the branch will be equal to the
// trip count of the loop.
//
// More extensive analysis could be done to handle more cases here.
//
if (!MustExecuteLoopHeader && ExitingBlock != L->getHeader()) {
// The simple checks failed, try climbing the unique predecessor chain
// up to the header.
bool Ok = false;
for (BasicBlock *BB = ExitingBlock; BB; ) {
BasicBlock *Pred = BB->getUniquePredecessor();
if (!Pred)
return getCouldNotCompute();
TerminatorInst *PredTerm = Pred->getTerminator();
for (const BasicBlock *PredSucc : PredTerm->successors()) {
if (PredSucc == BB)
continue;
// If the predecessor has a successor that isn't BB and isn't
// outside the loop, assume the worst.
if (L->contains(PredSucc))
return getCouldNotCompute();
}
if (Pred == L->getHeader()) {
Ok = true;
break;
}
BB = Pred;
}
if (!Ok)
return getCouldNotCompute();
}
bool IsOnlyExit = (L->getExitingBlock() != nullptr);
TerminatorInst *Term = ExitingBlock->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
assert(BI->isConditional() && "If unconditional, it can't be in loop!");
// Proceed to the next level to examine the exit condition expression.
return computeExitLimitFromCond(
L, BI->getCondition(), BI->getSuccessor(0), BI->getSuccessor(1),
/*ControlsExit=*/IsOnlyExit, AllowPredicates);
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(Term))
return computeExitLimitFromSingleExitSwitch(L, SI, Exit,
/*ControlsExit=*/IsOnlyExit);
return getCouldNotCompute();
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond(
const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB,
bool ControlsExit, bool AllowPredicates) {
ScalarEvolution::ExitLimitCacheTy Cache(L, TBB, FBB, AllowPredicates);
return computeExitLimitFromCondCached(Cache, L, ExitCond, TBB, FBB,
ControlsExit, AllowPredicates);
}
Optional<ScalarEvolution::ExitLimit>
ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond,
BasicBlock *TBB, BasicBlock *FBB,
bool ControlsExit, bool AllowPredicates) {
(void)this->L;
(void)this->TBB;
(void)this->FBB;
(void)this->AllowPredicates;
assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
this->AllowPredicates == AllowPredicates &&
"Variance in assumed invariant key components!");
auto Itr = TripCountMap.find({ExitCond, ControlsExit});
if (Itr == TripCountMap.end())
return None;
return Itr->second;
}
void ScalarEvolution::ExitLimitCache::insert(const Loop *L, Value *ExitCond,
BasicBlock *TBB, BasicBlock *FBB,
bool ControlsExit,
bool AllowPredicates,
const ExitLimit &EL) {
assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
this->AllowPredicates == AllowPredicates &&
"Variance in assumed invariant key components!");
auto InsertResult = TripCountMap.insert({{ExitCond, ControlsExit}, EL});
assert(InsertResult.second && "Expected successful insertion!");
(void)InsertResult;
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondCached(
ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
if (auto MaybeEL =
Cache.find(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates))
return *MaybeEL;
ExitLimit EL = computeExitLimitFromCondImpl(Cache, L, ExitCond, TBB, FBB,
ControlsExit, AllowPredicates);
Cache.insert(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates, EL);
return EL;
}
ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
bool EitherMayExit = L->contains(TBB);
ExitLimit EL0 = computeExitLimitFromCondCached(
Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCondCached(
Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
// Both conditions must be true for the loop to continue executing.
// Choose the less conservative count.
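// Illustrative example (hypothetical conditions): if the backedge is taken
// while (%i != %n) & (%i != %m) holds, the loop exits as soon as either
// comparison fails, so the exact count is the umin of the two individual
// exit counts (when both are computable).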
if (EL0.ExactNotTaken == getCouldNotCompute() ||
EL1.ExactNotTaken == getCouldNotCompute())
BECount = getCouldNotCompute();
else
BECount =
getUMinFromMismatchedTypes(EL0.ExactNotTaken, EL1.ExactNotTaken);
if (EL0.MaxNotTaken == getCouldNotCompute())
MaxBECount = EL1.MaxNotTaken;
else if (EL1.MaxNotTaken == getCouldNotCompute())
MaxBECount = EL0.MaxNotTaken;
else
MaxBECount =
getUMinFromMismatchedTypes(EL0.MaxNotTaken, EL1.MaxNotTaken);
} else {
// Both conditions must be true at the same time for the loop to exit.
// For now, be conservative.
assert(L->contains(FBB) && "Loop block has no successor in loop!");
if (EL0.MaxNotTaken == EL1.MaxNotTaken)
MaxBECount = EL0.MaxNotTaken;
if (EL0.ExactNotTaken == EL1.ExactNotTaken)
BECount = EL0.ExactNotTaken;
}
// There are cases (e.g. PR26207) where computeExitLimitFromCond is able
// to be more aggressive when computing BECount than when computing
// MaxBECount. In these cases it is possible for EL0.ExactNotTaken and
// EL1.ExactNotTaken to match, but for EL0.MaxNotTaken and EL1.MaxNotTaken
// to not.
if (isa<SCEVCouldNotCompute>(MaxBECount) &&
!isa<SCEVCouldNotCompute>(BECount))
MaxBECount = getConstant(getUnsignedRangeMax(BECount));
return ExitLimit(BECount, MaxBECount, false,
{&EL0.Predicates, &EL1.Predicates});
}
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
bool EitherMayExit = L->contains(FBB);
ExitLimit EL0 = computeExitLimitFromCondCached(
Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
AllowPredicates);
ExitLimit EL1 = computeExitLimitFromCondCached(
Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
// Both conditions must be false for the loop to continue executing.
// Choose the less conservative count.
if (EL0.ExactNotTaken == getCouldNotCompute() ||
EL1.ExactNotTaken == getCouldNotCompute())
BECount = getCouldNotCompute();
else
BECount =
getUMinFromMismatchedTypes(EL0.ExactNotTaken, EL1.ExactNotTaken);
if (EL0.MaxNotTaken == getCouldNotCompute())
MaxBECount = EL1.MaxNotTaken;
else if (EL1.MaxNotTaken == getCouldNotCompute())
MaxBECount = EL0.MaxNotTaken;
else
MaxBECount =
getUMinFromMismatchedTypes(EL0.MaxNotTaken, EL1.MaxNotTaken);
} else {
// Both conditions must be false at the same time for the loop to exit.
// For now, be conservative.
assert(L->contains(TBB) && "Loop block has no successor in loop!");
if (EL0.MaxNotTaken == EL1.MaxNotTaken)
MaxBECount = EL0.MaxNotTaken;
if (EL0.ExactNotTaken == EL1.ExactNotTaken)
BECount = EL0.ExactNotTaken;
}
return ExitLimit(BECount, MaxBECount, false,
{&EL0.Predicates, &EL1.Predicates});
}
}
// With an icmp, it may be feasible to compute an exact backedge-taken count.
// Proceed to the next level to examine the icmp.
if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) {
ExitLimit EL =
computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit);
if (EL.hasFullInfo() || !AllowPredicates)
return EL;
// Try again, but use SCEV predicates this time.
return computeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit,
/*AllowPredicates=*/true);
}
// Check for a constant condition. These are normally stripped out by
// SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
// preserve the CFG and is temporarily leaving constant conditions
// in place.
if (ConstantInt *CI = dyn_cast<ConstantInt>(ExitCond)) {
if (L->contains(FBB) == !CI->getZExtValue())
// The backedge is always taken.
return getCouldNotCompute();
else
// The backedge is never taken.
return getZero(CI->getType());
}
// If it's not an integer or pointer comparison then compute it the hard way.
return computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
}
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
ICmpInst *ExitCond,
BasicBlock *TBB,
BasicBlock *FBB,
bool ControlsExit,
bool AllowPredicates) {
// If the condition was exit on true, convert the condition to exit on false
ICmpInst::Predicate Cond;
if (!L->contains(FBB))
Cond = ExitCond->getPredicate();
else
Cond = ExitCond->getInversePredicate();
// Handle common loops like: for (X = "string"; *X; ++X)
if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
ExitLimit ItCnt =
computeLoadConstantCompareExitLimit(LI, RHS, L, Cond);
if (ItCnt.hasAnyInfo())
return ItCnt;
}
const SCEV *LHS = getSCEV(ExitCond->getOperand(0));
const SCEV *RHS = getSCEV(ExitCond->getOperand(1));
// Try to evaluate any dependencies out of the loop.
LHS = getSCEVAtScope(LHS, L);
RHS = getSCEVAtScope(RHS, L);
// At this point, we would like to compute how many iterations of the
// loop the predicate will return true for these inputs.
if (isLoopInvariant(LHS, L) && !isLoopInvariant(RHS, L)) {
// If there is a loop-invariant, force it into the RHS.
std::swap(LHS, RHS);
Cond = ICmpInst::getSwappedPredicate(Cond);
}
// Simplify the operands before analyzing them.
(void)SimplifyICmpOperands(Cond, LHS, RHS);
// If we have a comparison of a chrec against a constant, try to use value
// ranges to answer this query.
if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS))
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS))
if (AddRec->getLoop() == L) {
// Form the constant range.
ConstantRange CompRange =
ConstantRange::makeExactICmpRegion(Cond, RHSC->getAPInt());
const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this);
if (!isa<SCEVCouldNotCompute>(Ret)) return Ret;
}
switch (Cond) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit,
AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_EQ: { // while (X == Y)
// Convert to: while (X-Y == 0)
ExitLimit EL = howFarToNonZero(getMinusSCEV(LHS, RHS), L);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_ULT: { // while (X < Y)
bool IsSigned = Cond == ICmpInst::ICMP_SLT;
ExitLimit EL = howManyLessThans(LHS, RHS, L, IsSigned, ControlsExit,
AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_UGT: { // while (X > Y)
bool IsSigned = Cond == ICmpInst::ICMP_SGT;
ExitLimit EL =
howManyGreaterThans(LHS, RHS, L, IsSigned, ControlsExit,
AllowPredicates);
if (EL.hasAnyInfo()) return EL;
break;
}
default:
break;
}
auto *ExhaustiveCount =
computeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
if (!isa<SCEVCouldNotCompute>(ExhaustiveCount))
return ExhaustiveCount;
return computeShiftCompareExitLimit(ExitCond->getOperand(0),
ExitCond->getOperand(1), L, Cond);
}
ScalarEvolution::ExitLimit
ScalarEvolution::computeExitLimitFromSingleExitSwitch(const Loop *L,
SwitchInst *Switch,
BasicBlock *ExitingBlock,
bool ControlsExit) {
assert(!L->contains(ExitingBlock) && "Not an exiting block!");
// Give up if the exit is the default dest of a switch.
if (Switch->getDefaultDest() == ExitingBlock)
return getCouldNotCompute();
assert(L->contains(Switch->getDefaultDest()) &&
"Default case must not exit the loop!");
const SCEV *LHS = getSCEVAtScope(Switch->getCondition(), L);
const SCEV *RHS = getConstant(Switch->findCaseDest(ExitingBlock));
// while (X != Y) --> while (X-Y != 0)
ExitLimit EL = howFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
if (EL.hasAnyInfo())
return EL;
return getCouldNotCompute();
}
static ConstantInt *
EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
ScalarEvolution &SE) {
const SCEV *InVal = SE.getConstant(C);
const SCEV *Val = AddRec->evaluateAtIteration(InVal, SE);
assert(isa<SCEVConstant>(Val) &&
"Evaluation of SCEV at constant didn't fold correctly?");
return cast<SCEVConstant>(Val)->getValue();
}
/// Given an exit condition of 'icmp op load X, cst', try to see if we can
/// compute the backedge execution count.
ScalarEvolution::ExitLimit
ScalarEvolution::computeLoadConstantCompareExitLimit(
LoadInst *LI,
Constant *RHS,
const Loop *L,
ICmpInst::Predicate predicate) {
if (LI->isVolatile()) return getCouldNotCompute();
// Check to see if the loaded pointer is a getelementptr of a global.
// TODO: Use SCEV instead of manually grubbing with GEPs.
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
if (!GEP) return getCouldNotCompute();
// Make sure that it is really a constant global we are gepping, with an
// initializer, and make sure the first IDX is really 0.
GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
GEP->getNumOperands() < 3 || !isa<Constant>(GEP->getOperand(1)) ||
!cast<Constant>(GEP->getOperand(1))->isNullValue())
return getCouldNotCompute();
// Okay, we allow one non-constant index into the GEP instruction.
Value *VarIdx = nullptr;
std::vector<Constant*> Indexes;
unsigned VarIdxNum = 0;
for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i)
if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
Indexes.push_back(CI);
} else if (!isa<ConstantInt>(GEP->getOperand(i))) {
if (VarIdx) return getCouldNotCompute(); // Multiple non-constant idx's.
VarIdx = GEP->getOperand(i);
VarIdxNum = i-2;
Indexes.push_back(nullptr);
}
// Loop-invariant loads may be a byproduct of loop optimization. Skip them.
if (!VarIdx)
return getCouldNotCompute();
// Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant.
// Check to see if X is a loop variant variable value now.
const SCEV *Idx = getSCEV(VarIdx);
Idx = getSCEVAtScope(Idx, L);
// We can only recognize very limited forms of loop index expressions, in
// particular, only affine AddRec's like {C1,+,C2}.
const SCEVAddRecExpr *IdxExpr = dyn_cast<SCEVAddRecExpr>(Idx);
if (!IdxExpr || !IdxExpr->isAffine() || isLoopInvariant(IdxExpr, L) ||
!isa<SCEVConstant>(IdxExpr->getOperand(0)) ||
!isa<SCEVConstant>(IdxExpr->getOperand(1)))
return getCouldNotCompute();
unsigned MaxSteps = MaxBruteForceIterations;
for (unsigned IterationNum = 0; IterationNum != MaxSteps; ++IterationNum) {
ConstantInt *ItCst = ConstantInt::get(
cast<IntegerType>(IdxExpr->getType()), IterationNum);
ConstantInt *Val = EvaluateConstantChrecAtConstant(IdxExpr, ItCst, *this);
// Form the GEP offset.
Indexes[VarIdxNum] = Val;
Constant *Result = ConstantFoldLoadThroughGEPIndices(GV->getInitializer(),
Indexes);
if (!Result) break; // Cannot compute!
// Evaluate the condition for this iteration.
Result = ConstantExpr::getICmp(predicate, Result, RHS);
if (!isa<ConstantInt>(Result)) break; // Couldn't decide for sure
if (cast<ConstantInt>(Result)->getValue().isMinValue()) {
++NumArrayLenItCounts;
return getConstant(ItCst); // Found terminating iteration!
}
}
return getCouldNotCompute();
}
ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
Value *LHS, Value *RHSV, const Loop *L, ICmpInst::Predicate Pred) {
ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV);
if (!RHS)
return getCouldNotCompute();
const BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return getCouldNotCompute();
const BasicBlock *Predecessor = L->getLoopPredecessor();
if (!Predecessor)
return getCouldNotCompute();
// Return true if V is of the form "LHS `shift_op` <positive constant>".
// Return LHS in OutLHS and shift_op in OutOpCode.
auto MatchPositiveShift =
[](Value *V, Value *&OutLHS, Instruction::BinaryOps &OutOpCode) {
using namespace PatternMatch;
ConstantInt *ShiftAmt;
if (match(V, m_LShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt))))
OutOpCode = Instruction::LShr;
else if (match(V, m_AShr(m_Value(OutLHS), m_ConstantInt(ShiftAmt))))
OutOpCode = Instruction::AShr;
else if (match(V, m_Shl(m_Value(OutLHS), m_ConstantInt(ShiftAmt))))
OutOpCode = Instruction::Shl;
else
return false;
return ShiftAmt->getValue().isStrictlyPositive();
};
// Recognize a "shift recurrence" either of the form %iv or of %iv.shifted in
//
// loop:
// %iv = phi i32 [ %iv.shifted, %loop ], [ %val, %preheader ]
// %iv.shifted = lshr i32 %iv, <positive constant>
//
// Return true on a successful match. Return the corresponding PHI node (%iv
// above) in PNOut and the opcode of the shift operation in OpCodeOut.
auto MatchShiftRecurrence =
[&](Value *V, PHINode *&PNOut, Instruction::BinaryOps &OpCodeOut) {
Optional<Instruction::BinaryOps> PostShiftOpCode;
{
Instruction::BinaryOps OpC;
Value *V;
// If we encounter a shift instruction, "peel off" the shift operation,
// and remember that we did so. Later when we inspect %iv's backedge
// value, we will make sure that the backedge value uses the same
// operation.
//
// Note: the peeled shift operation does not have to be the same
// instruction as the one feeding into the PHI's backedge value. We only
// really care about it being the same *kind* of shift instruction --
// that's all that is required for our later inferences to hold.
if (MatchPositiveShift(LHS, V, OpC)) {
PostShiftOpCode = OpC;
LHS = V;
}
}
PNOut = dyn_cast<PHINode>(LHS);
if (!PNOut || PNOut->getParent() != L->getHeader())
return false;
Value *BEValue = PNOut->getIncomingValueForBlock(Latch);
Value *OpLHS;
return
// The backedge value for the PHI node must be a shift by a positive
// amount
MatchPositiveShift(BEValue, OpLHS, OpCodeOut) &&
// of the PHI node itself
OpLHS == PNOut &&
// and the kind of shift should match the kind of shift we peeled
// off, if any.
(!PostShiftOpCode.hasValue() || *PostShiftOpCode == OpCodeOut);
};
PHINode *PN;
Instruction::BinaryOps OpCode;
if (!MatchShiftRecurrence(LHS, PN, OpCode))
return getCouldNotCompute();
const DataLayout &DL = getDataLayout();
// The key rationale for this optimization is that for some kinds of shift
// recurrences, the value of the recurrence "stabilizes" to either 0 or -1
// within a finite number of iterations. If the condition guarding the
// backedge (in the sense that the backedge is taken if the condition is true)
// is false for the value the shift recurrence stabilizes to, then we know
// that the backedge is taken only a finite number of times.
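// Illustrative example (hypothetical start value): the recurrence
// {8,lshr,1} takes the values 8, 4, 2, 1, 0, 0, ... and so stabilizes to 0
// within bitwidth iterations; if the backedge requires the shifted value to
// be nonzero, the loop must therefore be finite.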
ConstantInt *StableValue = nullptr;
switch (OpCode) {
default:
llvm_unreachable("Impossible case!");
case Instruction::AShr: {
// {K,ashr,<positive-constant>} stabilizes to signum(K) in at most
// bitwidth(K) iterations.
Value *FirstValue = PN->getIncomingValueForBlock(Predecessor);
KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr,
Predecessor->getTerminator(), &DT);
auto *Ty = cast<IntegerType>(RHS->getType());
if (Known.isNonNegative())
StableValue = ConstantInt::get(Ty, 0);
else if (Known.isNegative())
StableValue = ConstantInt::get(Ty, -1, true);
else
return getCouldNotCompute();
break;
}
case Instruction::LShr:
case Instruction::Shl:
// Both {K,lshr,<positive-constant>} and {K,shl,<positive-constant>}
// stabilize to 0 in at most bitwidth(K) iterations.
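// For example, on i8 values {32,lshr,3} goes 32, 4, 0, ... and {1,shl,1}
// goes 1, 2, 4, ..., 128, 0, ..., so both reach 0 within 8 iterations.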
StableValue = ConstantInt::get(cast<IntegerType>(RHS->getType()), 0);
break;
}
auto *Result =
ConstantFoldCompareInstOperands(Pred, StableValue, RHS, DL, &TLI);
assert(Result->getType()->isIntegerTy(1) &&
"Otherwise cannot be an operand to a branch instruction");
if (Result->isZeroValue()) {
unsigned BitWidth = getTypeSizeInBits(RHS->getType());
const SCEV *UpperBound =
getConstant(getEffectiveSCEVType(RHS->getType()), BitWidth);
return ExitLimit(getCouldNotCompute(), UpperBound, false);
}
return getCouldNotCompute();
}
/// Return true if we can constant fold an instruction of the specified type,
/// assuming that all operands were constants.
static bool CanConstantFold(const Instruction *I) {
if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
isa<SelectInst>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<LoadInst>(I))
return true;
if (const CallInst *CI = dyn_cast<CallInst>(I))
if (const Function *F = CI->getCalledFunction())
return canConstantFoldCallTo(CI, F);
return false;
}
/// Determine whether this instruction can constant evolve within this loop
/// assuming its operands can all constant evolve.
static bool canConstantEvolve(Instruction *I, const Loop *L) {
// An instruction outside of the loop can't be derived from a loop PHI.
if (!L->contains(I)) return false;
if (isa<PHINode>(I)) {
// We don't currently keep track of the control flow needed to evaluate
// PHIs, so we cannot handle PHIs inside of loops.
return L->getHeader() == I->getParent();
}
// If we won't be able to constant fold this expression even if the operands
// are constants, bail early.
return CanConstantFold(I);
}
/// getConstantEvolvingPHIOperands - Implement getConstantEvolvingPHI by
/// recursing through each instruction operand until reaching a loop header phi.
static PHINode *
getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
DenseMap<Instruction *, PHINode *> &PHIMap,
unsigned Depth) {
if (Depth > MaxConstantEvolvingDepth)
return nullptr;
// Otherwise, we can evaluate this instruction if all of its operands are
// constant or derived from a PHI node themselves.
PHINode *PHI = nullptr;
for (Value *Op : UseInst->operands()) {
if (isa<Constant>(Op)) continue;
Instruction *OpInst = dyn_cast<Instruction>(Op);
if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr;
PHINode *P = dyn_cast<PHINode>(OpInst);
if (!P)
// If this operand is already visited, reuse the prior result.
// We may have P != PHI if this is the deepest point at which the
// inconsistent paths meet.
P = PHIMap.lookup(OpInst);
if (!P) {
// Recurse and memoize the results, whether a phi is found or not.
// This recursive call invalidates pointers into PHIMap.
P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap, Depth + 1);
PHIMap[OpInst] = P;
}
if (!P)
return nullptr; // Not evolving from PHI
if (PHI && PHI != P)
return nullptr; // Evolving from multiple different PHIs.
PHI = P;
}
// This is an expression evolving from a constant PHI!
return PHI;
}
/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
/// in the loop that V is derived from. We allow arbitrary operations along the
/// way, but the operands of an operation must either be constants or a value
/// derived from a constant PHI. If this expression does not fit with these
/// constraints, return null.
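/// For example, given the header phi %p = phi i32 [ 0, %entry ], [ %p.next, %loop ]
/// and %v = mul i32 %p, 3, calling this on %v returns %p; an expression that
/// mixes two different header phis returns null.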
static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I || !canConstantEvolve(I, L)) return nullptr;
if (PHINode *PN = dyn_cast<PHINode>(I))
return PN;
// Record non-constant instructions contained by the loop.
DenseMap<Instruction *, PHINode *> PHIMap;
return getConstantEvolvingPHIOperands(I, L, PHIMap, 0);
}
/// EvaluateExpression - Given an expression that passes the
/// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI node
/// in the loop has the value PHIVal. If we can't fold this expression for some
/// reason, return null.
static Constant *EvaluateExpression(Value *V, const Loop *L,
DenseMap<Instruction *, Constant *> &Vals,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
// Convenient constant check, but redundant for recursive calls.
if (Constant *C = dyn_cast<Constant>(V)) return C;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return nullptr;
if (Constant *C = Vals.lookup(I)) return C;
// An instruction inside the loop depends on a value outside the loop that we
// weren't given a mapping for, or a value such as a call inside the loop.
if (!canConstantEvolve(I, L)) return nullptr;
// An unmapped PHI can be due to a branch or another loop inside this loop,
// or due to this not being the initial iteration through a loop where we
// couldn't compute the evolution of this particular PHI last time.
if (isa<PHINode>(I)) return nullptr;
std::vector<Constant*> Operands(I->getNumOperands());
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
Instruction *Operand = dyn_cast<Instruction>(I->getOperand(i));
if (!Operand) {
Operands[i] = dyn_cast<Constant>(I->getOperand(i));
if (!Operands[i]) return nullptr;
continue;
}
Constant *C = EvaluateExpression(Operand, L, Vals, DL, TLI);
Vals[Operand] = C;
if (!C) return nullptr;
Operands[i] = C;
}
if (CmpInst *CI = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0],
Operands[1], DL, TLI);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isVolatile())
return ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL);
}
return ConstantFoldInstOperands(I, Operands, DL, TLI);
}
// If every incoming value to PN except the one for BB is a specific Constant,
// return that, else return nullptr.
static Constant *getOtherIncomingValue(PHINode *PN, BasicBlock *BB) {
Constant *IncomingVal = nullptr;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
if (PN->getIncomingBlock(i) == BB)
continue;
auto *CurrentVal = dyn_cast<Constant>(PN->getIncomingValue(i));
if (!CurrentVal)
return nullptr;
if (IncomingVal != CurrentVal) {
if (IncomingVal)
return nullptr;
IncomingVal = CurrentVal;
}
}
return IncomingVal;
}
/// getConstantEvolutionLoopExitValue - If we know that the specified Phi is
/// in the header of its containing loop, we know the loop executes a
/// constant number of times, and the PHI node is just a recurrence
/// involving constants, fold it.
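/// For example, for a header phi %i = phi i32 [ 0, %preheader ], [ %i.next, %latch ]
/// with %i.next = add i32 %i, 2 and a backedge-taken count of 3, the brute-force
/// evaluation below yields the exit value 6.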
Constant *
ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
const APInt &BEs,
const Loop *L) {
auto I = ConstantEvolutionLoopExitValue.find(PN);
if (I != ConstantEvolutionLoopExitValue.end())
return I->second;
if (BEs.ugt(MaxBruteForceIterations))
return ConstantEvolutionLoopExitValue[PN] = nullptr; // Not going to evaluate it.
Constant *&RetVal = ConstantEvolutionLoopExitValue[PN];
DenseMap<Instruction *, Constant *> CurrentIterVals;
BasicBlock *Header = L->getHeader();
assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!");
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return nullptr;
for (auto &I : *Header) {
PHINode *PHI = dyn_cast<PHINode>(&I);
if (!PHI) break;
auto *StartCST = getOtherIncomingValue(PHI, Latch);
if (!StartCST) continue;
CurrentIterVals[PHI] = StartCST;
}
if (!CurrentIterVals.count(PN))
return RetVal = nullptr;
Value *BEValue = PN->getIncomingValueForBlock(Latch);
// Execute the loop symbolically to determine the exit value.
if (BEs.getActiveBits() >= 32)
return RetVal = nullptr; // More than 2^32-1 iterations?? Not doing it!
unsigned NumIterations = BEs.getZExtValue(); // must be in range
unsigned IterationNum = 0;
const DataLayout &DL = getDataLayout();
for (; ; ++IterationNum) {
if (IterationNum == NumIterations)
return RetVal = CurrentIterVals[PN]; // Got exit value!
// Compute the value of the PHIs for the next iteration.
// EvaluateExpression adds non-phi values to the CurrentIterVals map.
DenseMap<Instruction *, Constant *> NextIterVals;
Constant *NextPHI =
EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI);
if (!NextPHI)
return nullptr; // Couldn't evaluate!
NextIterVals[PN] = NextPHI;
bool StoppedEvolving = NextPHI == CurrentIterVals[PN];
// Also evaluate the other PHI nodes. However, we don't get to stop if we
// cease to be able to evaluate one of them or if they stop evolving,
// because that doesn't necessarily prevent us from computing PN.
SmallVector<std::pair<PHINode *, Constant *>, 8> PHIsToCompute;
for (const auto &I : CurrentIterVals) {
PHINode *PHI = dyn_cast<PHINode>(I.first);
if (!PHI || PHI == PN || PHI->getParent() != Header) continue;
PHIsToCompute.emplace_back(PHI, I.second);
}
// We use two distinct loops because EvaluateExpression may invalidate any
// iterators into CurrentIterVals.
for (const auto &I : PHIsToCompute) {
PHINode *PHI = I.first;
Constant *&NextPHI = NextIterVals[PHI];
if (!NextPHI) { // Not already computed.
Value *BEValue = PHI->getIncomingValueForBlock(Latch);
NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI);
}
if (NextPHI != I.second)
StoppedEvolving = false;
}
// If all entries in CurrentIterVals == NextIterVals then we can stop
// iterating, the loop can't continue to change.
if (StoppedEvolving)
return RetVal = CurrentIterVals[PN];
CurrentIterVals.swap(NextIterVals);
}
}
const SCEV *ScalarEvolution::computeExitCountExhaustively(const Loop *L,
Value *Cond,
bool ExitWhen) {
PHINode *PN = getConstantEvolvingPHI(Cond, L);
if (!PN) return getCouldNotCompute();
// If the loop is canonicalized, the PHI will have exactly two entries.
// That's the only form we support here.
if (PN->getNumIncomingValues() != 2) return getCouldNotCompute();
DenseMap<Instruction *, Constant *> CurrentIterVals;
BasicBlock *Header = L->getHeader();
assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!");
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Should follow from NumIncomingValues == 2!");
for (auto &I : *Header) {
PHINode *PHI = dyn_cast<PHINode>(&I);
if (!PHI)
break;
auto *StartCST = getOtherIncomingValue(PHI, Latch);
if (!StartCST) continue;
CurrentIterVals[PHI] = StartCST;
}
if (!CurrentIterVals.count(PN))
return getCouldNotCompute();
// Okay, we found a PHI node that defines the trip count of this loop. Execute
// the loop symbolically to determine when the condition gets a value of
// "ExitWhen".
unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis.
const DataLayout &DL = getDataLayout();
for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){
auto *CondVal = dyn_cast_or_null<ConstantInt>(
EvaluateExpression(Cond, L, CurrentIterVals, DL, &TLI));
// Couldn't symbolically evaluate.
if (!CondVal) return getCouldNotCompute();
if (CondVal->getValue() == uint64_t(ExitWhen)) {
++NumBruteForceTripCountsComputed;
return getConstant(Type::getInt32Ty(getContext()), IterationNum);
}
// Update all the PHI nodes for the next iteration.
DenseMap<Instruction *, Constant *> NextIterVals;
// Create a list of which PHIs we need to compute. We want to do this before
// calling EvaluateExpression on them because that may invalidate iterators
// into CurrentIterVals.
SmallVector<PHINode *, 8> PHIsToCompute;
for (const auto &I : CurrentIterVals) {
PHINode *PHI = dyn_cast<PHINode>(I.first);
if (!PHI || PHI->getParent() != Header) continue;
PHIsToCompute.push_back(PHI);
}
for (PHINode *PHI : PHIsToCompute) {
Constant *&NextPHI = NextIterVals[PHI];
if (NextPHI) continue; // Already computed!
Value *BEValue = PHI->getIncomingValueForBlock(Latch);
NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL, &TLI);
}
CurrentIterVals.swap(NextIterVals);
}
// Too many iterations were needed to evaluate.
return getCouldNotCompute();
}
const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values =
ValuesAtScopes[V];
// Check to see if we've folded this expression at this loop before.
for (auto &LS : Values)
if (LS.first == L)
return LS.second ? LS.second : V;
Values.emplace_back(L, nullptr);
// Otherwise compute it.
const SCEV *C = computeSCEVAtScope(V, L);
for (auto &LS : reverse(ValuesAtScopes[V]))
if (LS.first == L) {
LS.second = C;
break;
}
return C;
}
/// This builds up a Constant using the ConstantExpr interface. That way, we
/// will return Constants for objects which aren't represented by a
/// SCEVConstant, because SCEVConstant is restricted to ConstantInt.
/// Returns NULL if the SCEV isn't representable as a Constant.
static Constant *BuildConstantFromSCEV(const SCEV *V) {
switch (static_cast<SCEVTypes>(V->getSCEVType())) {
case scCouldNotCompute:
case scAddRecExpr:
break;
case scConstant:
return cast<SCEVConstant>(V)->getValue();
case scUnknown:
return dyn_cast<Constant>(cast<SCEVUnknown>(V)->getValue());
case scSignExtend: {
const SCEVSignExtendExpr *SS = cast<SCEVSignExtendExpr>(V);
if (Constant *CastOp = BuildConstantFromSCEV(SS->getOperand()))
return ConstantExpr::getSExt(CastOp, SS->getType());
break;
}
case scZeroExtend: {
const SCEVZeroExtendExpr *SZ = cast<SCEVZeroExtendExpr>(V);
if (Constant *CastOp = BuildConstantFromSCEV(SZ->getOperand()))
return ConstantExpr::getZExt(CastOp, SZ->getType());
break;
}
case scTruncate: {
const SCEVTruncateExpr *ST = cast<SCEVTruncateExpr>(V);
if (Constant *CastOp = BuildConstantFromSCEV(ST->getOperand()))
return ConstantExpr::getTrunc(CastOp, ST->getType());
break;
}
case scAddExpr: {
const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) {
if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
unsigned AS = PTy->getAddressSpace();
Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
C = ConstantExpr::getBitCast(C, DestPtrTy);
}
for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
if (!C2) return nullptr;
// First pointer!
if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
unsigned AS = C2->getType()->getPointerAddressSpace();
std::swap(C, C2);
Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
// The offsets have been converted to bytes. We can add bytes to an
// i8* by GEP with the byte count in the first index.
C = ConstantExpr::getBitCast(C, DestPtrTy);
}
// Don't bother trying to sum two pointers. We probably can't
// statically compute a load that results from it anyway.
if (C2->getType()->isPointerTy())
return nullptr;
if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
if (PTy->getElementType()->isStructTy())
C2 = ConstantExpr::getIntegerCast(
C2, Type::getInt32Ty(C->getContext()), true);
C = ConstantExpr::getGetElementPtr(PTy->getElementType(), C, C2);
} else
C = ConstantExpr::getAdd(C, C2);
}
return C;
}
break;
}
case scMulExpr: {
const SCEVMulExpr *SM = cast<SCEVMulExpr>(V);
if (Constant *C = BuildConstantFromSCEV(SM->getOperand(0))) {
// Don't bother with pointers at all.
if (C->getType()->isPointerTy()) return nullptr;
for (unsigned i = 1, e = SM->getNumOperands(); i != e; ++i) {
Constant *C2 = BuildConstantFromSCEV(SM->getOperand(i));
if (!C2 || C2->getType()->isPointerTy()) return nullptr;
C = ConstantExpr::getMul(C, C2);
}
return C;
}
break;
}
case scUDivExpr: {
const SCEVUDivExpr *SU = cast<SCEVUDivExpr>(V);
if (Constant *LHS = BuildConstantFromSCEV(SU->getLHS()))
if (Constant *RHS = BuildConstantFromSCEV(SU->getRHS()))
if (LHS->getType() == RHS->getType())
return ConstantExpr::getUDiv(LHS, RHS);
break;
}
case scSMaxExpr:
case scUMaxExpr:
break; // TODO: smax, umax.
}
return nullptr;
}
const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
if (isa<SCEVConstant>(V)) return V;
// If this instruction is evolved from a constant-evolving PHI, compute the
// exit value from the loop without using SCEVs.
if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) {
if (Instruction *I = dyn_cast<Instruction>(SU->getValue())) {
const Loop *LI = this->LI[I->getParent()];
if (LI && LI->getParentLoop() == L) // Looking for loop exit value.
if (PHINode *PN = dyn_cast<PHINode>(I))
if (PN->getParent() == LI->getHeader()) {
// Okay, there is no closed form solution for the PHI node. Check
// to see if the loop that contains it has a known backedge-taken
// count. If so, we may be able to force computation of the exit
// value.
const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI);
if (const SCEVConstant *BTCC =
dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
+
+ // This trivial case can show up in some degenerate cases where
+ // the incoming IR has not yet been fully simplified.
+ if (BTCC->getValue()->isZero()) {
+ Value *InitValue = nullptr;
+ bool MultipleInitValues = false;
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ if (!LI->contains(PN->getIncomingBlock(i))) {
+ if (!InitValue)
+ InitValue = PN->getIncomingValue(i);
+ else if (InitValue != PN->getIncomingValue(i)) {
+ MultipleInitValues = true;
+ break;
+ }
+ }
+ if (!MultipleInitValues && InitValue)
+ return getSCEV(InitValue);
+ }
+ }
// Okay, we know how many times the containing loop executes. If
// this is a constant evolving PHI node, get the final value at
// the specified iteration number.
Constant *RV =
getConstantEvolutionLoopExitValue(PN, BTCC->getAPInt(), LI);
if (RV) return getSCEV(RV);
}
}
// Okay, this is an expression that we cannot symbolically evaluate
// into a SCEV. Check to see if it's possible to symbolically evaluate
// the arguments into constants, and if so, try to constant propagate the
// result. This is particularly useful for computing loop exit values.
if (CanConstantFold(I)) {
SmallVector<Constant *, 4> Operands;
bool MadeImprovement = false;
for (Value *Op : I->operands()) {
if (Constant *C = dyn_cast<Constant>(Op)) {
Operands.push_back(C);
continue;
}
// If any of the operands is non-constant and if they are
// non-integer and non-pointer, don't even try to analyze them
// with scev techniques.
if (!isSCEVable(Op->getType()))
return V;
const SCEV *OrigV = getSCEV(Op);
const SCEV *OpV = getSCEVAtScope(OrigV, L);
MadeImprovement |= OrigV != OpV;
Constant *C = BuildConstantFromSCEV(OpV);
if (!C) return V;
if (C->getType() != Op->getType())
C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
Op->getType(),
false),
C, Op->getType());
Operands.push_back(C);
}
// Check to see if getSCEVAtScope actually made an improvement.
if (MadeImprovement) {
Constant *C = nullptr;
const DataLayout &DL = getDataLayout();
if (const CmpInst *CI = dyn_cast<CmpInst>(I))
C = ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0],
Operands[1], DL, &TLI);
else if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!LI->isVolatile())
C = ConstantFoldLoadFromConstPtr(Operands[0], LI->getType(), DL);
} else
C = ConstantFoldInstOperands(I, Operands, DL, &TLI);
if (!C) return V;
return getSCEV(C);
}
}
}
// This is some other type of SCEVUnknown, just return it.
return V;
}
if (const SCEVCommutativeExpr *Comm = dyn_cast<SCEVCommutativeExpr>(V)) {
// Avoid performing the look-up in the common case where the specified
// expression has no loop-variant portions.
for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) {
const SCEV *OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
if (OpAtScope != Comm->getOperand(i)) {
// Okay, at least one of these operands is loop variant but might be
// foldable. Build a new instance of the folded commutative expression.
SmallVector<const SCEV *, 8> NewOps(Comm->op_begin(),
Comm->op_begin()+i);
NewOps.push_back(OpAtScope);
for (++i; i != e; ++i) {
OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
NewOps.push_back(OpAtScope);
}
if (isa<SCEVAddExpr>(Comm))
return getAddExpr(NewOps);
if (isa<SCEVMulExpr>(Comm))
return getMulExpr(NewOps);
if (isa<SCEVSMaxExpr>(Comm))
return getSMaxExpr(NewOps);
if (isa<SCEVUMaxExpr>(Comm))
return getUMaxExpr(NewOps);
llvm_unreachable("Unknown commutative SCEV type!");
}
}
// If we got here, all operands are loop invariant.
return Comm;
}
if (const SCEVUDivExpr *Div = dyn_cast<SCEVUDivExpr>(V)) {
const SCEV *LHS = getSCEVAtScope(Div->getLHS(), L);
const SCEV *RHS = getSCEVAtScope(Div->getRHS(), L);
if (LHS == Div->getLHS() && RHS == Div->getRHS())
return Div; // must be loop invariant
return getUDivExpr(LHS, RHS);
}
// If this is a loop recurrence for a loop that does not contain L, then we
// are dealing with the final value computed by the loop.
if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V)) {
// First, attempt to evaluate each operand.
// Avoid performing the look-up in the common case where the specified
// expression has no loop-variant portions.
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
const SCEV *OpAtScope = getSCEVAtScope(AddRec->getOperand(i), L);
if (OpAtScope == AddRec->getOperand(i))
continue;
// Okay, at least one of these operands is loop variant but might be
// foldable. Build a new instance of the folded commutative expression.
SmallVector<const SCEV *, 8> NewOps(AddRec->op_begin(),
AddRec->op_begin()+i);
NewOps.push_back(OpAtScope);
for (++i; i != e; ++i)
NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L));
const SCEV *FoldedRec =
getAddRecExpr(NewOps, AddRec->getLoop(),
AddRec->getNoWrapFlags(SCEV::FlagNW));
AddRec = dyn_cast<SCEVAddRecExpr>(FoldedRec);
// The addrec may be folded to a nonrecurrence, for example, if the
// induction variable is multiplied by zero after constant folding. Go
// ahead and return the folded value.
if (!AddRec)
return FoldedRec;
break;
}
// If the scope is outside the addrec's loop, evaluate it by using the
// loop exit value of the addrec.
if (!AddRec->getLoop()->contains(L)) {
// To evaluate this recurrence, we need to know how many times the AddRec
// loop iterates. Compute this now.
const SCEV *BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop());
if (BackedgeTakenCount == getCouldNotCompute()) return AddRec;
// Then, evaluate the AddRec.
return AddRec->evaluateAtIteration(BackedgeTakenCount, *this);
}
return AddRec;
}
if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) {
const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getZeroExtendExpr(Op, Cast->getType());
}
if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) {
const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getSignExtendExpr(Op, Cast->getType());
}
if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) {
const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
return getTruncateExpr(Op, Cast->getType());
}
llvm_unreachable("Unknown SCEV type!");
}
const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
return getSCEVAtScope(getSCEV(V), L);
}
/// Finds the minimum unsigned root of the following equation:
///
/// A * X = B (mod N)
///
/// where N = 2^BW and BW is the common bit width of A and B. The signedness of
/// A and B isn't important.
///
/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
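/// For example, with BW = 8, A = 4 and B = 8 the minimum solution of
/// 4 * X = 8 (mod 256) is X = 2, while A = 4 and B = 6 has no solution because
/// gcd(4, 256) = 4 does not divide 6.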
static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
ScalarEvolution &SE) {
uint32_t BW = A.getBitWidth();
assert(BW == SE.getTypeSizeInBits(B->getType()));
assert(A != 0 && "A must be non-zero.");
// 1. D = gcd(A, N)
//
// The gcd of A and N may have only one prime factor: 2. The number of
// trailing zeros in A is its multiplicity
uint32_t Mult2 = A.countTrailingZeros();
// D = 2^Mult2
// 2. Check if B is divisible by D.
//
// B is divisible by D if and only if the multiplicity of prime factor 2 for B
// is not less than multiplicity of this prime factor for D.
if (SE.GetMinTrailingZeros(B) < Mult2)
return SE.getCouldNotCompute();
// 3. Compute I: the multiplicative inverse of (A / D) in arithmetic
// modulo (N / D).
//
// If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent
// (N / D) in general. The inverse itself always fits into BW bits, though,
// so we immediately truncate it.
APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D
APInt Mod(BW + 1, 0);
Mod.setBit(BW - Mult2); // Mod = N / D
APInt I = AD.multiplicativeInverse(Mod).trunc(BW);
// 4. Compute the minimum unsigned root of the equation:
// I * (B / D) mod (N / D)
// To simplify the computation, we factor out the divide by D:
// (I * B mod N) / D
const SCEV *D = SE.getConstant(APInt::getOneBitSet(BW, Mult2));
return SE.getUDivExactExpr(SE.getMulExpr(B, SE.getConstant(I)), D);
}
/// Find the roots of the quadratic equation for the given quadratic chrec
/// {L,+,M,+,N}. This returns either the two roots (which might be the same) or
/// two SCEVCouldNotCompute objects.
///
static Optional<std::pair<const SCEVConstant *,const SCEVConstant *>>
SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!");
const SCEVConstant *LC = dyn_cast<SCEVConstant>(AddRec->getOperand(0));
const SCEVConstant *MC = dyn_cast<SCEVConstant>(AddRec->getOperand(1));
const SCEVConstant *NC = dyn_cast<SCEVConstant>(AddRec->getOperand(2));
// We currently can only solve this if the coefficients are constants.
if (!LC || !MC || !NC)
return None;
uint32_t BitWidth = LC->getAPInt().getBitWidth();
const APInt &L = LC->getAPInt();
const APInt &M = MC->getAPInt();
const APInt &N = NC->getAPInt();
APInt Two(BitWidth, 2);
// Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
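// The chrec value at iteration X is L + M*X + N*X*(X-1)/2, which expands to
// (N/2)*X^2 + (M - N/2)*X + L, giving the coefficients computed below.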
// The A coefficient is N/2
APInt A = N.sdiv(Two);
// The B coefficient is M-N/2
APInt B = M;
B -= A; // A is the same as N/2.
// The C coefficient is L.
const APInt& C = L;
// Compute the B^2-4ac term.
APInt SqrtTerm = B;
SqrtTerm *= B;
SqrtTerm -= 4 * (A * C);
if (SqrtTerm.isNegative()) {
// The loop is provably infinite.
return None;
}
// Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
// integer value or else APInt::sqrt() will assert.
APInt SqrtVal = SqrtTerm.sqrt();
// Compute the two solutions for the quadratic formula.
// The divisions must be performed as signed divisions.
APInt NegB = -std::move(B);
APInt TwoA = std::move(A);
TwoA <<= 1;
if (TwoA.isNullValue())
return None;
LLVMContext &Context = SE.getContext();
ConstantInt *Solution1 =
ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
ConstantInt *Solution2 =
ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
cast<SCEVConstant>(SE.getConstant(Solution2)));
}
ScalarEvolution::ExitLimit
ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
bool AllowPredicates) {
// This is only used for loops with a "x != y" exit test. The exit condition
// is now expressed as a single expression, V = x-y. So the exit test is
// effectively V != 0. We know and take advantage of the fact that this
// expression is only ever used in a comparison-with-zero context.
SmallPtrSet<const SCEVPredicate *, 4> Predicates;
// If the value is a constant
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
// If the value is already zero, the branch will execute zero times.
if (C->getValue()->isZero()) return C;
return getCouldNotCompute(); // Otherwise it will loop infinitely.
}
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V);
if (!AddRec && AllowPredicates)
// Try to make this an AddRec using runtime tests, in the first X
// iterations of this loop, where X is the SCEV expression found by the
// algorithm below.
AddRec = convertSCEVToAddRecWithPredicates(V, L, Predicates);
if (!AddRec || AddRec->getLoop() != L)
return getCouldNotCompute();
// If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of
// the quadratic equation to solve it.
if (AddRec->isQuadratic() && AddRec->getType()->isIntegerTy()) {
if (auto Roots = SolveQuadraticEquation(AddRec, *this)) {
const SCEVConstant *R1 = Roots->first;
const SCEVConstant *R2 = Roots->second;
// Pick the smallest positive root value.
if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
CmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
if (!CB->getZExtValue())
std::swap(R1, R2); // R1 is the minimum root now.
// We can only use this value if the chrec ends up with an exact zero
// value at this index. When solving for "X*X != 5", for example, we
// should not accept a root of 2.
const SCEV *Val = AddRec->evaluateAtIteration(R1, *this);
if (Val->isZero())
// We found a quadratic root!
return ExitLimit(R1, R1, false, Predicates);
}
}
return getCouldNotCompute();
}
// Otherwise we can only handle this if it is affine.
if (!AddRec->isAffine())
return getCouldNotCompute();
// If this is an affine expression, the execution count of this branch is
// the minimum unsigned root of the following equation:
//
// Start + Step*N = 0 (mod 2^BW)
//
// equivalent to:
//
// Step*N = -Start (mod 2^BW)
//
// where BW is the common bit width of Start and Step.
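// For example, for the 8-bit recurrence {6,+,-2} the equation is
// -2*N = -6 (mod 256); its minimum unsigned root is N = 3, matching the
// values 6, 4, 2, 0 taken by the recurrence.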
// Get the initial value for the loop.
const SCEV *Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
const SCEV *Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
// For now we handle only constant steps.
//
// TODO: Handle a nonconstant Step given AddRec<NUW>. If the
// AddRec is NUW, then (in an unsigned sense) it cannot be counting up to wrap
// to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
// We have not yet seen any such cases.
const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
if (!StepC || StepC->getValue()->isZero())
return getCouldNotCompute();
// For positive steps (counting up until unsigned overflow):
// N = -Start/Step (as unsigned)
// For negative steps (counting down to zero):
// N = Start/-Step
// First compute the unsigned distance from zero in the direction of Step.
bool CountDown = StepC->getAPInt().isNegative();
const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start);
// Handle unitary steps, which cannot wraparound.
// 1*N = -Start; -1*N = Start (mod 2^BW), so:
// N = Distance (as unsigned)
if (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne()) {
APInt MaxBECount = getUnsignedRangeMax(Distance);
// When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated,
// we end up with a loop whose backedge-taken count is n - 1. Detect this
// case, and see if we can improve the bound.
//
// Explicitly handling this here is necessary because getUnsignedRange
// isn't context-sensitive; it doesn't know that we only care about the
// range inside the loop.
const SCEV *Zero = getZero(Distance->getType());
const SCEV *One = getOne(Distance->getType());
const SCEV *DistancePlusOne = getAddExpr(Distance, One);
if (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, DistancePlusOne, Zero)) {
// If Distance + 1 doesn't overflow, we can compute the maximum distance
// as "unsigned_max(Distance + 1) - 1".
ConstantRange CR = getUnsignedRange(DistancePlusOne);
MaxBECount = APIntOps::umin(MaxBECount, CR.getUnsignedMax() - 1);
}
return ExitLimit(Distance, getConstant(MaxBECount), false, Predicates);
}
// If the condition controls loop exit (the loop exits only if the expression
// is true) and the addition is no-wrap we can use unsigned divide to
// compute the backedge count. In this case, the step may not divide the
// distance, but we don't care because if the condition is "missed" the loop
// will have undefined behavior due to wrapping.
if (ControlsExit && AddRec->hasNoSelfWrap() &&
loopHasNoAbnormalExits(AddRec->getLoop())) {
const SCEV *Exact =
getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
const SCEV *Max =
Exact == getCouldNotCompute()
? Exact
: getConstant(getUnsignedRangeMax(Exact));
return ExitLimit(Exact, Max, false, Predicates);
}
// Solve the general equation.
const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(),
getNegativeSCEV(Start), *this);
const SCEV *M = E == getCouldNotCompute()
? E
: getConstant(getUnsignedRangeMax(E));
return ExitLimit(E, M, false, Predicates);
}
ScalarEvolution::ExitLimit
ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) {
// Loops that look like: while (X == 0) are very strange indeed. We don't
// handle them yet except for the trivial case. This could be expanded in the
// future as needed.
// If the value is a constant, check to see if it is known to be non-zero
// already. If so, the backedge will execute zero times.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
if (!C->getValue()->isZero())
return getZero(C->getType());
return getCouldNotCompute(); // Otherwise it will loop infinitely.
}
// We could implement others, but I really doubt anyone writes loops like
// this, and if they did, they would already be constant folded.
return getCouldNotCompute();
}
std::pair<BasicBlock *, BasicBlock *>
ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) {
// If the block has a unique predecessor, then there is no path from the
// predecessor to the block that does not go through the direct edge
// from the predecessor to the block.
if (BasicBlock *Pred = BB->getSinglePredecessor())
return {Pred, BB};
// A loop's header is defined to be a block that dominates the loop.
// If the header has a unique predecessor outside the loop, it must be
// a block that has exactly one successor that can reach the loop.
if (Loop *L = LI.getLoopFor(BB))
return {L->getLoopPredecessor(), L->getHeader()};
return {nullptr, nullptr};
}
/// SCEV structural equivalence is usually sufficient for testing whether two
/// expressions are equal, however for the purposes of looking for a condition
/// guarding a loop, it can be useful to be a little more general, since a
/// front-end may have replicated the controlling expression.
///
static bool HasSameValue(const SCEV *A, const SCEV *B) {
// Quick check to see if they are the same SCEV.
if (A == B) return true;
auto ComputesEqualValues = [](const Instruction *A, const Instruction *B) {
// Not all instructions that are "identical" compute the same value. For
// instance, two distinct alloca instructions allocating the same type are
// identical and do not read memory; but compute distinct values.
return A->isIdenticalTo(B) && (isa<BinaryOperator>(A) || isa<GetElementPtrInst>(A));
};
// Otherwise, if they're both SCEVUnknown, it's possible that they hold
// two different instructions with the same value. Check for this case.
if (const SCEVUnknown *AU = dyn_cast<SCEVUnknown>(A))
if (const SCEVUnknown *BU = dyn_cast<SCEVUnknown>(B))
if (const Instruction *AI = dyn_cast<Instruction>(AU->getValue()))
if (const Instruction *BI = dyn_cast<Instruction>(BU->getValue()))
if (ComputesEqualValues(AI, BI))
return true;
// Otherwise assume they may have a different value.
return false;
}
bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
const SCEV *&LHS, const SCEV *&RHS,
unsigned Depth) {
bool Changed = false;
// If we hit the max recursion limit bail out.
if (Depth >= 3)
return false;
// Canonicalize a constant to the right side.
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
// Check for both operands constant.
if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
if (ConstantExpr::getICmp(Pred,
LHSC->getValue(),
RHSC->getValue())->isNullValue())
goto trivially_false;
else
goto trivially_true;
}
// Otherwise swap the operands to put the constant on the right.
std::swap(LHS, RHS);
Pred = ICmpInst::getSwappedPredicate(Pred);
Changed = true;
}
// If we're comparing an addrec with a value which is loop-invariant in the
// addrec's loop, put the addrec on the left. Also make a dominance check,
// as both operands could be addrecs loop-invariant in each other's loop.
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(RHS)) {
const Loop *L = AR->getLoop();
if (isLoopInvariant(LHS, L) && properlyDominates(LHS, L->getHeader())) {
std::swap(LHS, RHS);
Pred = ICmpInst::getSwappedPredicate(Pred);
Changed = true;
}
}
// If there's a constant operand, canonicalize comparisons with boundary
// cases, and canonicalize *-or-equal comparisons to regular comparisons.
if (const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS)) {
const APInt &RA = RC->getAPInt();
bool SimplifiedByConstantRange = false;
if (!ICmpInst::isEquality(Pred)) {
ConstantRange ExactCR = ConstantRange::makeExactICmpRegion(Pred, RA);
if (ExactCR.isFullSet())
goto trivially_true;
else if (ExactCR.isEmptySet())
goto trivially_false;
APInt NewRHS;
CmpInst::Predicate NewPred;
if (ExactCR.getEquivalentICmp(NewPred, NewRHS) &&
ICmpInst::isEquality(NewPred)) {
// We were able to convert an inequality to an equality.
Pred = NewPred;
RHS = getConstant(NewRHS);
Changed = SimplifiedByConstantRange = true;
}
}
if (!SimplifiedByConstantRange) {
switch (Pred) {
default:
break;
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_NE:
// Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b.
if (!RA)
if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(LHS))
if (const SCEVMulExpr *ME =
dyn_cast<SCEVMulExpr>(AE->getOperand(0)))
if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 &&
ME->getOperand(0)->isAllOnesValue()) {
RHS = AE->getOperand(1);
LHS = ME->getOperand(1);
Changed = true;
}
break;
// The "Should have been caught earlier!" messages refer to the fact
// that the ExactCR.isFullSet() or ExactCR.isEmptySet() check above
// should have fired on the corresponding cases, and canonicalized the
// check to trivially_true or trivially_false.
case ICmpInst::ICMP_UGE:
assert(!RA.isMinValue() && "Should have been caught earlier!");
Pred = ICmpInst::ICMP_UGT;
RHS = getConstant(RA - 1);
Changed = true;
break;
case ICmpInst::ICMP_ULE:
assert(!RA.isMaxValue() && "Should have been caught earlier!");
Pred = ICmpInst::ICMP_ULT;
RHS = getConstant(RA + 1);
Changed = true;
break;
case ICmpInst::ICMP_SGE:
assert(!RA.isMinSignedValue() && "Should have been caught earlier!");
Pred = ICmpInst::ICMP_SGT;
RHS = getConstant(RA - 1);
Changed = true;
break;
case ICmpInst::ICMP_SLE:
assert(!RA.isMaxSignedValue() && "Should have been caught earlier!");
Pred = ICmpInst::ICMP_SLT;
RHS = getConstant(RA + 1);
Changed = true;
break;
}
}
}
// Check for obvious equality.
if (HasSameValue(LHS, RHS)) {
if (ICmpInst::isTrueWhenEqual(Pred))
goto trivially_true;
if (ICmpInst::isFalseWhenEqual(Pred))
goto trivially_false;
}
// If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by
// adding or subtracting 1 from one of the operands.
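// For example, "X s<= Y" becomes "X s< Y + 1" when Y + 1 cannot overflow in
// the signed sense, and "X u>= Y" becomes "X u> Y - 1" when Y is known to be
// non-zero.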
switch (Pred) {
case ICmpInst::ICMP_SLE:
if (!getSignedRangeMax(RHS).isMaxSignedValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SLT;
Changed = true;
} else if (!getSignedRangeMin(LHS).isMinSignedValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SLT;
Changed = true;
}
break;
case ICmpInst::ICMP_SGE:
if (!getSignedRangeMin(RHS).isMinSignedValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SGT;
Changed = true;
} else if (!getSignedRangeMax(LHS).isMaxSignedValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SGT;
Changed = true;
}
break;
case ICmpInst::ICMP_ULE:
if (!getUnsignedRangeMax(RHS).isMaxValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
SCEV::FlagNUW);
Pred = ICmpInst::ICMP_ULT;
Changed = true;
} else if (!getUnsignedRangeMin(LHS).isMinValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS);
Pred = ICmpInst::ICMP_ULT;
Changed = true;
}
break;
case ICmpInst::ICMP_UGE:
if (!getUnsignedRangeMin(RHS).isMinValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
Pred = ICmpInst::ICMP_UGT;
Changed = true;
} else if (!getUnsignedRangeMax(LHS).isMaxValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
SCEV::FlagNUW);
Pred = ICmpInst::ICMP_UGT;
Changed = true;
}
break;
default:
break;
}
// TODO: More simplifications are possible here.
// Recursively simplify until we either hit a recursion limit or nothing
// changes.
if (Changed)
return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1);
return Changed;
trivially_true:
// Return 0 == 0.
LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
Pred = ICmpInst::ICMP_EQ;
return true;
trivially_false:
// Return 0 != 0.
LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
Pred = ICmpInst::ICMP_NE;
return true;
}
bool ScalarEvolution::isKnownNegative(const SCEV *S) {
return getSignedRangeMax(S).isNegative();
}
bool ScalarEvolution::isKnownPositive(const SCEV *S) {
return getSignedRangeMin(S).isStrictlyPositive();
}
bool ScalarEvolution::isKnownNonNegative(const SCEV *S) {
return !getSignedRangeMin(S).isNegative();
}
bool ScalarEvolution::isKnownNonPositive(const SCEV *S) {
return !getSignedRangeMax(S).isStrictlyPositive();
}
bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
return isKnownNegative(S) || isKnownPositive(S);
}
bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// Canonicalize the inputs first.
(void)SimplifyICmpOperands(Pred, LHS, RHS);
// If LHS or RHS is an addrec, check to see if the condition is true in
// every iteration of the loop.
// If LHS and RHS are both addrec, both conditions must be true in
// every iteration of the loop.
const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS);
bool LeftGuarded = false;
bool RightGuarded = false;
if (LAR) {
const Loop *L = LAR->getLoop();
if (isLoopEntryGuardedByCond(L, Pred, LAR->getStart(), RHS) &&
isLoopBackedgeGuardedByCond(L, Pred, LAR->getPostIncExpr(*this), RHS)) {
if (!RAR) return true;
LeftGuarded = true;
}
}
if (RAR) {
const Loop *L = RAR->getLoop();
if (isLoopEntryGuardedByCond(L, Pred, LHS, RAR->getStart()) &&
isLoopBackedgeGuardedByCond(L, Pred, LHS, RAR->getPostIncExpr(*this))) {
if (!LAR) return true;
RightGuarded = true;
}
}
if (LeftGuarded && RightGuarded)
return true;
if (isKnownPredicateViaSplitting(Pred, LHS, RHS))
return true;
// Otherwise see what can be done with known constant ranges.
return isKnownPredicateViaConstantRanges(Pred, LHS, RHS);
}
bool ScalarEvolution::isMonotonicPredicate(const SCEVAddRecExpr *LHS,
ICmpInst::Predicate Pred,
bool &Increasing) {
bool Result = isMonotonicPredicateImpl(LHS, Pred, Increasing);
#ifndef NDEBUG
// Verify an invariant: inverting the predicate should turn a monotonically
// increasing change to a monotonically decreasing one, and vice versa.
bool IncreasingSwapped;
bool ResultSwapped = isMonotonicPredicateImpl(
LHS, ICmpInst::getSwappedPredicate(Pred), IncreasingSwapped);
assert(Result == ResultSwapped && "should be able to analyze both!");
if (ResultSwapped)
assert(Increasing == !IncreasingSwapped &&
"monotonicity should flip as we flip the predicate");
#endif
return Result;
}
bool ScalarEvolution::isMonotonicPredicateImpl(const SCEVAddRecExpr *LHS,
ICmpInst::Predicate Pred,
bool &Increasing) {
// A zero step value for LHS means the induction variable is essentially a
// loop invariant value. We don't really depend on the predicate actually
// flipping from false to true (for increasing predicates, and the other way
// around for decreasing predicates), all we care about is that *if* the
// predicate changes then it only changes from false to true.
//
// A zero step value in itself is not very useful, but there may be places
// where SCEV can prove X >= 0 but not prove X > 0, so it is helpful to be
// as general as possible.
switch (Pred) {
default:
return false; // Conservative answer
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
if (!LHS->hasNoUnsignedWrap())
return false;
Increasing = Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE;
return true;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE: {
if (!LHS->hasNoSignedWrap())
return false;
const SCEV *Step = LHS->getStepRecurrence(*this);
if (isKnownNonNegative(Step)) {
Increasing = Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE;
return true;
}
if (isKnownNonPositive(Step)) {
Increasing = Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE;
return true;
}
return false;
}
}
llvm_unreachable("switch has default clause!");
}
bool ScalarEvolution::isLoopInvariantPredicate(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Loop *L,
ICmpInst::Predicate &InvariantPred, const SCEV *&InvariantLHS,
const SCEV *&InvariantRHS) {
// If there is a loop-invariant, force it into the RHS, otherwise bail out.
if (!isLoopInvariant(RHS, L)) {
if (!isLoopInvariant(LHS, L))
return false;
std::swap(LHS, RHS);
Pred = ICmpInst::getSwappedPredicate(Pred);
}
const SCEVAddRecExpr *ArLHS = dyn_cast<SCEVAddRecExpr>(LHS);
if (!ArLHS || ArLHS->getLoop() != L)
return false;
bool Increasing;
if (!isMonotonicPredicate(ArLHS, Pred, Increasing))
return false;
// If the predicate "ArLHS `Pred` RHS" monotonically increases from false to
// true as the loop iterates, and the backedge is control dependent on
// "ArLHS `Pred` RHS" == true then we can reason as follows:
//
// * if the predicate was false in the first iteration then the predicate
// is never evaluated again, since the loop exits without taking the
// backedge.
// * if the predicate was true in the first iteration then it will
// continue to be true for all future iterations since it is
// monotonically increasing.
//
// For both the above possibilities, we can replace the loop varying
// predicate with its value on the first iteration of the loop (which is
// loop invariant).
//
// A similar reasoning applies for a monotonically decreasing predicate, by
// replacing true with false and false with true in the above two bullets.
auto P = Increasing ? Pred : ICmpInst::getInversePredicate(Pred);
if (!isLoopBackedgeGuardedByCond(L, P, LHS, RHS))
return false;
InvariantPred = Pred;
InvariantLHS = ArLHS->getStart();
InvariantRHS = RHS;
return true;
}
bool ScalarEvolution::isKnownPredicateViaConstantRanges(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
if (HasSameValue(LHS, RHS))
return ICmpInst::isTrueWhenEqual(Pred);
// This code is split out from isKnownPredicate because it is called from
// within isLoopEntryGuardedByCond.
auto CheckRanges =
[&](const ConstantRange &RangeLHS, const ConstantRange &RangeRHS) {
return ConstantRange::makeSatisfyingICmpRegion(Pred, RangeRHS)
.contains(RangeLHS);
};
// The check at the top of the function catches the case where the values are
// known to be equal.
if (Pred == CmpInst::ICMP_EQ)
return false;
if (Pred == CmpInst::ICMP_NE)
return CheckRanges(getSignedRange(LHS), getSignedRange(RHS)) ||
CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS)) ||
isKnownNonZero(getMinusSCEV(LHS, RHS));
if (CmpInst::isSigned(Pred))
return CheckRanges(getSignedRange(LHS), getSignedRange(RHS));
return CheckRanges(getUnsignedRange(LHS), getUnsignedRange(RHS));
}
bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS) {
// Match Result to (X + Y)<ExpectedFlags> where Y is a constant integer.
// Return Y via OutY.
auto MatchBinaryAddToConst =
[this](const SCEV *Result, const SCEV *X, APInt &OutY,
SCEV::NoWrapFlags ExpectedFlags) {
const SCEV *NonConstOp, *ConstOp;
SCEV::NoWrapFlags FlagsPresent;
if (!splitBinaryAdd(Result, ConstOp, NonConstOp, FlagsPresent) ||
!isa<SCEVConstant>(ConstOp) || NonConstOp != X)
return false;
OutY = cast<SCEVConstant>(ConstOp)->getAPInt();
return (FlagsPresent & ExpectedFlags) == ExpectedFlags;
};
APInt C;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGE:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLE:
// X s<= (X + C)<nsw> if C >= 0
if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative())
return true;
// (X + C)<nsw> s<= X if C <= 0
if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) &&
!C.isStrictlyPositive())
return true;
break;
case ICmpInst::ICMP_SGT:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLT:
// X s< (X + C)<nsw> if C > 0
if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) &&
C.isStrictlyPositive())
return true;
// (X + C)<nsw> s< X if C < 0
if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && C.isNegative())
return true;
break;
}
return false;
}
bool ScalarEvolution::isKnownPredicateViaSplitting(ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS) {
if (Pred != ICmpInst::ICMP_ULT || ProvingSplitPredicate)
return false;
// Allowing an arbitrary number of activations of isKnownPredicateViaSplitting on
// the stack can result in exponential time complexity.
SaveAndRestore<bool> Restore(ProvingSplitPredicate, true);
// If L >= 0 then I `ult` L <=> I >= 0 && I `slt` L
//
// To prove L >= 0 we use isKnownNonNegative whereas to prove I >= 0 we use
// isKnownPredicate. isKnownPredicate is more powerful, but also more
// expensive; and using isKnownNonNegative(RHS) is sufficient for most of the
// interesting cases seen in practice. We can consider "upgrading" L >= 0 to
// use isKnownPredicate later if needed.
return isKnownNonNegative(RHS) &&
isKnownPredicate(CmpInst::ICMP_SGE, LHS, getZero(LHS->getType())) &&
isKnownPredicate(CmpInst::ICMP_SLT, LHS, RHS);
}
bool ScalarEvolution::isImpliedViaGuard(BasicBlock *BB,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// No need to even try if we know the module has no guards.
if (!HasGuards)
return false;
return any_of(*BB, [&](Instruction &I) {
using namespace llvm::PatternMatch;
Value *Condition;
return match(&I, m_Intrinsic<Intrinsic::experimental_guard>(
m_Value(Condition))) &&
isImpliedCond(Pred, LHS, RHS, Condition, false);
});
}
/// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is
/// protected by a conditional between LHS and RHS. This is used to
/// eliminate casts.
bool
ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// Interpret a null as meaning no loop, where there is obviously no guard
// (interprocedural conditions notwithstanding).
if (!L) return true;
if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
return true;
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return false;
BranchInst *LoopContinuePredicate =
dyn_cast<BranchInst>(Latch->getTerminator());
if (LoopContinuePredicate && LoopContinuePredicate->isConditional() &&
isImpliedCond(Pred, LHS, RHS,
LoopContinuePredicate->getCondition(),
LoopContinuePredicate->getSuccessor(0) != L->getHeader()))
return true;
// We don't want more than one activation of the following loops on the stack
// -- that can lead to O(n!) time complexity.
if (WalkingBEDominatingConds)
return false;
SaveAndRestore<bool> ClearOnExit(WalkingBEDominatingConds, true);
// See if we can exploit a trip count to prove the predicate.
const auto &BETakenInfo = getBackedgeTakenInfo(L);
const SCEV *LatchBECount = BETakenInfo.getExact(Latch, this);
if (LatchBECount != getCouldNotCompute()) {
// We know that Latch branches back to the loop header exactly
// LatchBECount times. This means the backedge condition at Latch is
// equivalent to "{0,+,1} u< LatchBECount".
Type *Ty = LatchBECount->getType();
auto NoWrapFlags = SCEV::NoWrapFlags(SCEV::FlagNUW | SCEV::FlagNW);
const SCEV *LoopCounter =
getAddRecExpr(getZero(Ty), getOne(Ty), L, NoWrapFlags);
if (isImpliedCond(Pred, LHS, RHS, ICmpInst::ICMP_ULT, LoopCounter,
LatchBECount))
return true;
}
// Check conditions due to any @llvm.assume intrinsics.
for (auto &AssumeVH : AC.assumptions()) {
if (!AssumeVH)
continue;
auto *CI = cast<CallInst>(AssumeVH);
if (!DT.dominates(CI, Latch->getTerminator()))
continue;
if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false))
return true;
}
// If the loop is not reachable from the entry block, we risk running into an
// infinite loop as we walk up into the dom tree. These loops do not matter
// anyway, so we just return a conservative answer when we see them.
if (!DT.isReachableFromEntry(L->getHeader()))
return false;
if (isImpliedViaGuard(Latch, Pred, LHS, RHS))
return true;
for (DomTreeNode *DTN = DT[Latch], *HeaderDTN = DT[L->getHeader()];
DTN != HeaderDTN; DTN = DTN->getIDom()) {
assert(DTN && "should reach the loop header before reaching the root!");
BasicBlock *BB = DTN->getBlock();
if (isImpliedViaGuard(BB, Pred, LHS, RHS))
return true;
BasicBlock *PBB = BB->getSinglePredecessor();
if (!PBB)
continue;
BranchInst *ContinuePredicate = dyn_cast<BranchInst>(PBB->getTerminator());
if (!ContinuePredicate || !ContinuePredicate->isConditional())
continue;
Value *Condition = ContinuePredicate->getCondition();
// If we have an edge `E` within the loop body that dominates the only
// latch, the condition guarding `E` also guards the backedge. This
// reasoning works only for loops with a single latch.
BasicBlockEdge DominatingEdge(PBB, BB);
if (DominatingEdge.isSingleEdge()) {
// We're constructively (and conservatively) enumerating edges within the
// loop body that dominate the latch. The dominator tree better agree
// with us on this:
assert(DT.dominates(DominatingEdge, Latch) && "should be!");
if (isImpliedCond(Pred, LHS, RHS, Condition,
BB != ContinuePredicate->getSuccessor(0)))
return true;
}
}
return false;
}
bool
ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// Interpret a null as meaning no loop, where there is obviously no guard
// (interprocedural conditions notwithstanding).
if (!L) return false;
if (isKnownPredicateViaConstantRanges(Pred, LHS, RHS))
return true;
// Starting at the loop predecessor, climb up the predecessor chain, as long
// as there are predecessors that can be found that have unique successors
// leading to the original header.
for (std::pair<BasicBlock *, BasicBlock *>
Pair(L->getLoopPredecessor(), L->getHeader());
Pair.first;
Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
if (isImpliedViaGuard(Pair.first, Pred, LHS, RHS))
return true;
BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate ||
LoopEntryPredicate->isUnconditional())
continue;
if (isImpliedCond(Pred, LHS, RHS,
LoopEntryPredicate->getCondition(),
LoopEntryPredicate->getSuccessor(0) != Pair.second))
return true;
}
// Check conditions due to any @llvm.assume intrinsics.
for (auto &AssumeVH : AC.assumptions()) {
if (!AssumeVH)
continue;
auto *CI = cast<CallInst>(AssumeVH);
if (!DT.dominates(CI, L->getHeader()))
continue;
if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false))
return true;
}
return false;
}
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
Value *FoundCondValue,
bool Inverse) {
if (!PendingLoopPredicates.insert(FoundCondValue).second)
return false;
auto ClearOnExit =
make_scope_exit([&]() { PendingLoopPredicates.erase(FoundCondValue); });
// Recursively handle And and Or conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) {
if (BO->getOpcode() == Instruction::And) {
if (!Inverse)
return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) ||
isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse);
} else if (BO->getOpcode() == Instruction::Or) {
if (Inverse)
return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) ||
isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse);
}
}
ICmpInst *ICI = dyn_cast<ICmpInst>(FoundCondValue);
if (!ICI) return false;
// Now that we have found a conditional branch that dominates the loop or
// controls the loop latch, check to see if it is the comparison we are
// looking for.
ICmpInst::Predicate FoundPred;
if (Inverse)
FoundPred = ICI->getInversePredicate();
else
FoundPred = ICI->getPredicate();
const SCEV *FoundLHS = getSCEV(ICI->getOperand(0));
const SCEV *FoundRHS = getSCEV(ICI->getOperand(1));
return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS);
}
bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS,
ICmpInst::Predicate FoundPred,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
// Balance the types.
if (getTypeSizeInBits(LHS->getType()) <
getTypeSizeInBits(FoundLHS->getType())) {
if (CmpInst::isSigned(Pred)) {
LHS = getSignExtendExpr(LHS, FoundLHS->getType());
RHS = getSignExtendExpr(RHS, FoundLHS->getType());
} else {
LHS = getZeroExtendExpr(LHS, FoundLHS->getType());
RHS = getZeroExtendExpr(RHS, FoundLHS->getType());
}
} else if (getTypeSizeInBits(LHS->getType()) >
getTypeSizeInBits(FoundLHS->getType())) {
if (CmpInst::isSigned(FoundPred)) {
FoundLHS = getSignExtendExpr(FoundLHS, LHS->getType());
FoundRHS = getSignExtendExpr(FoundRHS, LHS->getType());
} else {
FoundLHS = getZeroExtendExpr(FoundLHS, LHS->getType());
FoundRHS = getZeroExtendExpr(FoundRHS, LHS->getType());
}
}
// Canonicalize the query to match the way instcombine will have
// canonicalized the comparison.
if (SimplifyICmpOperands(Pred, LHS, RHS))
if (LHS == RHS)
return CmpInst::isTrueWhenEqual(Pred);
if (SimplifyICmpOperands(FoundPred, FoundLHS, FoundRHS))
if (FoundLHS == FoundRHS)
return CmpInst::isFalseWhenEqual(FoundPred);
// Check to see if we can make the LHS or RHS match.
if (LHS == FoundRHS || RHS == FoundLHS) {
if (isa<SCEVConstant>(RHS)) {
std::swap(FoundLHS, FoundRHS);
FoundPred = ICmpInst::getSwappedPredicate(FoundPred);
} else {
std::swap(LHS, RHS);
Pred = ICmpInst::getSwappedPredicate(Pred);
}
}
// Check whether the found predicate is the same as the desired predicate.
if (FoundPred == Pred)
return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS);
// Check whether swapping the found predicate makes it the same as the
// desired predicate.
if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) {
if (isa<SCEVConstant>(RHS))
return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS);
else
return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred),
RHS, LHS, FoundLHS, FoundRHS);
}
// Unsigned comparison is the same as signed comparison when both the operands
// are non-negative.
if (CmpInst::isUnsigned(FoundPred) &&
CmpInst::getSignedPredicate(FoundPred) == Pred &&
isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS))
return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS);
// Check if we can make progress by sharpening ranges.
if (FoundPred == ICmpInst::ICMP_NE &&
(isa<SCEVConstant>(FoundLHS) || isa<SCEVConstant>(FoundRHS))) {
const SCEVConstant *C = nullptr;
const SCEV *V = nullptr;
if (isa<SCEVConstant>(FoundLHS)) {
C = cast<SCEVConstant>(FoundLHS);
V = FoundRHS;
} else {
C = cast<SCEVConstant>(FoundRHS);
V = FoundLHS;
}
// The guarding predicate tells us that C != V. If the known range
// of V is [C, t), we can sharpen the range to [C + 1, t). The
// range we consider has to correspond to the same signedness as the
// predicate we're interested in folding.
APInt Min = ICmpInst::isSigned(Pred) ?
getSignedRangeMin(V) : getUnsignedRangeMin(V);
if (Min == C->getAPInt()) {
// Given (V >= Min && V != Min) we conclude V >= (Min + 1).
// This is true even if (Min + 1) wraps around -- in case of
// wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)).
APInt SharperMin = Min + 1;
switch (Pred) {
case ICmpInst::ICMP_SGE:
case ICmpInst::ICMP_UGE:
// We know V `Pred` SharperMin. If this implies LHS `Pred`
// RHS, we're done.
if (isImpliedCondOperands(Pred, LHS, RHS, V,
getConstant(SharperMin)))
return true;
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_UGT:
// We know from the range information that (V `Pred` Min ||
// V == Min). We know from the guarding condition that !(V
// == Min). This gives us
//
// V `Pred` Min || V == Min && !(V == Min)
// => V `Pred` Min
//
// If V `Pred` Min implies LHS `Pred` RHS, we're done.
if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min)))
return true;
LLVM_FALLTHROUGH;
default:
// No change
break;
}
}
}
// Check whether the actual condition is beyond sufficient.
if (FoundPred == ICmpInst::ICMP_EQ)
if (ICmpInst::isTrueWhenEqual(Pred))
if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
if (Pred == ICmpInst::ICMP_NE)
if (!ICmpInst::isTrueWhenEqual(FoundPred))
if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS))
return true;
// Otherwise assume the worst.
return false;
}
bool ScalarEvolution::splitBinaryAdd(const SCEV *Expr,
const SCEV *&L, const SCEV *&R,
SCEV::NoWrapFlags &Flags) {
const auto *AE = dyn_cast<SCEVAddExpr>(Expr);
if (!AE || AE->getNumOperands() != 2)
return false;
L = AE->getOperand(0);
R = AE->getOperand(1);
Flags = AE->getNoWrapFlags();
return true;
}
Optional<APInt> ScalarEvolution::computeConstantDifference(const SCEV *More,
const SCEV *Less) {
// We avoid subtracting expressions here because this function is usually
// fairly deep in the call stack (i.e. is called many times).
if (isa<SCEVAddRecExpr>(Less) && isa<SCEVAddRecExpr>(More)) {
const auto *LAR = cast<SCEVAddRecExpr>(Less);
const auto *MAR = cast<SCEVAddRecExpr>(More);
if (LAR->getLoop() != MAR->getLoop())
return None;
// We look at affine expressions only; not for correctness but to keep
// getStepRecurrence cheap.
if (!LAR->isAffine() || !MAR->isAffine())
return None;
if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this))
return None;
Less = LAR->getStart();
More = MAR->getStart();
// fall through
}
if (isa<SCEVConstant>(Less) && isa<SCEVConstant>(More)) {
const auto &M = cast<SCEVConstant>(More)->getAPInt();
const auto &L = cast<SCEVConstant>(Less)->getAPInt();
return M - L;
}
const SCEV *L, *R;
SCEV::NoWrapFlags Flags;
if (splitBinaryAdd(Less, L, R, Flags))
if (const auto *LC = dyn_cast<SCEVConstant>(L))
if (R == More)
return -(LC->getAPInt());
if (splitBinaryAdd(More, L, R, Flags))
if (const auto *LC = dyn_cast<SCEVConstant>(L))
if (R == Less)
return LC->getAPInt();
return None;
}
bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow(
ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS, const SCEV *FoundRHS) {
if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_ULT)
return false;
const auto *AddRecLHS = dyn_cast<SCEVAddRecExpr>(LHS);
if (!AddRecLHS)
return false;
const auto *AddRecFoundLHS = dyn_cast<SCEVAddRecExpr>(FoundLHS);
if (!AddRecFoundLHS)
return false;
// We'd like to let SCEV reason about control dependencies, so we constrain
// both the inequalities to be about add recurrences on the same loop. This
// way we can use isLoopEntryGuardedByCond later.
const Loop *L = AddRecFoundLHS->getLoop();
if (L != AddRecLHS->getLoop())
return false;
// FoundLHS u< FoundRHS u< -C => (FoundLHS + C) u< (FoundRHS + C) ... (1)
//
// FoundLHS s< FoundRHS s< INT_MIN - C => (FoundLHS + C) s< (FoundRHS + C)
// ... (2)
//
// Informal proof for (2), assuming (1) [*]:
//
// We'll also assume (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)) ... (3)[**]
//
// Then
//
// FoundLHS s< FoundRHS s< INT_MIN - C
// <=> (FoundLHS + INT_MIN) u< (FoundRHS + INT_MIN) u< -C [ using (3) ]
// <=> (FoundLHS + INT_MIN + C) u< (FoundRHS + INT_MIN + C) [ using (1) ]
// <=> (FoundLHS + INT_MIN + C + INT_MIN) s<
// (FoundRHS + INT_MIN + C + INT_MIN) [ using (3) ]
// <=> FoundLHS + C s< FoundRHS + C
//
// [*]: (1) can be proved by ruling out overflow.
//
// [**]: This can be proved by analyzing all the four possibilities:
// (A s< 0, B s< 0), (A s< 0, B s>= 0), (A s>= 0, B s< 0) and
// (A s>= 0, B s>= 0).
//
// Note:
// Despite (2), "FoundRHS s< INT_MIN - C" does not mean that "FoundRHS + C"
// will not sign underflow. For instance, say FoundLHS = (i8 -128), FoundRHS
// = (i8 -127) and C = (i8 -100). Then INT_MIN - C = (i8 -28), and FoundRHS
// s< (INT_MIN - C). Lack of sign overflow / underflow in "FoundRHS + C" is
// neither necessary nor sufficient to prove "(FoundLHS + C) s< (FoundRHS +
// C)".
Optional<APInt> LDiff = computeConstantDifference(LHS, FoundLHS);
Optional<APInt> RDiff = computeConstantDifference(RHS, FoundRHS);
if (!LDiff || !RDiff || *LDiff != *RDiff)
return false;
if (LDiff->isMinValue())
return true;
APInt FoundRHSLimit;
if (Pred == CmpInst::ICMP_ULT) {
FoundRHSLimit = -(*RDiff);
} else {
assert(Pred == CmpInst::ICMP_SLT && "Checked above!");
FoundRHSLimit = APInt::getSignedMinValue(getTypeSizeInBits(RHS->getType())) - *RDiff;
}
// Try to prove (1) or (2), as needed.
return isLoopEntryGuardedByCond(L, Pred, FoundRHS,
getConstant(FoundRHSLimit));
}
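// Illustrative sketch, not part of the original file: an exhaustive 8-bit
// check of fact (3) used in the informal proof above, i.e. that
// (A s< B) <=> ((A + INT_MIN) u< (B + INT_MIN)). The helper name and the use
// of plain 'unsigned char' arithmetic are assumptions made only for this
// sketch.
static bool checkSignedToUnsignedShift8Bit() {
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B) {
      // Adding INT_MIN (here -128) modulo 2^8 is the same as adding 128.
      unsigned char UA = static_cast<unsigned char>(A + 128);
      unsigned char UB = static_cast<unsigned char>(B + 128);
      if ((A < B) != (UA < UB))
        return false;
    }
  return true; // holds for every 8-bit pair, matching note [**] above
}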
bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
return isImpliedCondOperandsHelper(Pred, LHS, RHS,
FoundLHS, FoundRHS) ||
// ~x < ~y --> x > y
isImpliedCondOperandsHelper(Pred, LHS, RHS,
getNotSCEV(FoundRHS),
getNotSCEV(FoundLHS));
}
/// If Expr computes ~A, return A, otherwise return nullptr.
static const SCEV *MatchNotExpr(const SCEV *Expr) {
const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
if (!Add || Add->getNumOperands() != 2 ||
!Add->getOperand(0)->isAllOnesValue())
return nullptr;
const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
if (!AddRHS || AddRHS->getNumOperands() != 2 ||
!AddRHS->getOperand(0)->isAllOnesValue())
return nullptr;
return AddRHS->getOperand(1);
}
/// Is MaybeMaxExpr an SMax or UMax of Candidate and some other values?
template<typename MaxExprType>
static bool IsMaxConsistingOf(const SCEV *MaybeMaxExpr,
const SCEV *Candidate) {
const MaxExprType *MaxExpr = dyn_cast<MaxExprType>(MaybeMaxExpr);
if (!MaxExpr) return false;
return find(MaxExpr->operands(), Candidate) != MaxExpr->op_end();
}
/// Is MaybeMinExpr an SMin or UMin of Candidate and some other values?
template<typename MaxExprType>
static bool IsMinConsistingOf(ScalarEvolution &SE,
const SCEV *MaybeMinExpr,
const SCEV *Candidate) {
const SCEV *MaybeMaxExpr = MatchNotExpr(MaybeMinExpr);
if (!MaybeMaxExpr)
return false;
return IsMaxConsistingOf<MaxExprType>(MaybeMaxExpr, SE.getNotSCEV(Candidate));
}
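// Illustrative sketch, not part of the original file: SCEV has no dedicated
// min expression, so a "min" is recognized above as the bitwise-not of a max
// of bitwise-nots, with MatchNotExpr spotting ~A in its SCEV form
// (-1) + (-1) * A. For unsigned values this rests on the identity
// min(A, B) == ~max(~A, ~B), shown here with plain unsigned ints (the helper
// name is an assumption of this sketch).
static unsigned minViaNotOfMax(unsigned A, unsigned B) {
  unsigned MaxOfNots = (~A > ~B) ? ~A : ~B; // umax(~A, ~B)
  return ~MaxOfNots;                        // == umin(A, B)
}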
static bool IsKnownPredicateViaAddRecStart(ScalarEvolution &SE,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// If both sides are affine addrecs for the same loop, with equal
// steps, and we know the recurrences don't wrap, then we only
// need to check the predicate on the starting values.
if (!ICmpInst::isRelational(Pred))
return false;
const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS);
if (!LAR)
return false;
const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS);
if (!RAR)
return false;
if (LAR->getLoop() != RAR->getLoop())
return false;
if (!LAR->isAffine() || !RAR->isAffine())
return false;
if (LAR->getStepRecurrence(SE) != RAR->getStepRecurrence(SE))
return false;
SCEV::NoWrapFlags NW = ICmpInst::isSigned(Pred) ?
SCEV::FlagNSW : SCEV::FlagNUW;
if (!LAR->getNoWrapFlags(NW) || !RAR->getNoWrapFlags(NW))
return false;
return SE.isKnownPredicate(Pred, LAR->getStart(), RAR->getStart());
}
/// Is LHS `Pred` RHS true by virtue of LHS or RHS being a Min or Max
/// expression?
static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
switch (Pred) {
default:
return false;
case ICmpInst::ICMP_SGE:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLE:
return
// min(A, ...) <= A
IsMinConsistingOf<SCEVSMaxExpr>(SE, LHS, RHS) ||
// A <= max(A, ...)
IsMaxConsistingOf<SCEVSMaxExpr>(RHS, LHS);
case ICmpInst::ICMP_UGE:
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ICmpInst::ICMP_ULE:
return
// min(A, ...) <= A
IsMinConsistingOf<SCEVUMaxExpr>(SE, LHS, RHS) ||
// A <= max(A, ...)
IsMaxConsistingOf<SCEVUMaxExpr>(RHS, LHS);
}
llvm_unreachable("covered switch fell through?!");
}
bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS,
unsigned Depth) {
assert(getTypeSizeInBits(LHS->getType()) ==
getTypeSizeInBits(RHS->getType()) &&
"LHS and RHS have different sizes?");
assert(getTypeSizeInBits(FoundLHS->getType()) ==
getTypeSizeInBits(FoundRHS->getType()) &&
"FoundLHS and FoundRHS have different sizes?");
// We want to avoid hurting the compile time with analysis of too big trees.
if (Depth > MaxSCEVOperationsImplicationDepth)
return false;
// We only want to work with ICMP_SGT comparison so far.
// TODO: Extend to ICMP_UGT?
if (Pred == ICmpInst::ICMP_SLT) {
Pred = ICmpInst::ICMP_SGT;
std::swap(LHS, RHS);
std::swap(FoundLHS, FoundRHS);
}
if (Pred != ICmpInst::ICMP_SGT)
return false;
auto GetOpFromSExt = [&](const SCEV *S) {
if (auto *Ext = dyn_cast<SCEVSignExtendExpr>(S))
return Ext->getOperand();
// TODO: If S is a SCEVConstant then you can cheaply "strip" the sext off
// the constant in some cases.
return S;
};
// Acquire values from extensions.
auto *OrigFoundLHS = FoundLHS;
LHS = GetOpFromSExt(LHS);
FoundLHS = GetOpFromSExt(FoundLHS);
// Can the SGT predicate be proved trivially or by using the found context?
auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) {
return isKnownViaSimpleReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS,
FoundRHS, Depth + 1);
};
if (auto *LHSAddExpr = dyn_cast<SCEVAddExpr>(LHS)) {
// We want to avoid creation of any new non-constant SCEV. Since we are
// going to compare the operands to RHS, we should be certain that we don't
// need any size extensions for this. So let's decline all cases when the
// sizes of types of LHS and RHS do not match.
// TODO: Maybe try to get RHS from sext to catch more cases?
if (getTypeSizeInBits(LHS->getType()) != getTypeSizeInBits(RHS->getType()))
return false;
// Should not overflow.
if (!LHSAddExpr->hasNoSignedWrap())
return false;
auto *LL = LHSAddExpr->getOperand(0);
auto *LR = LHSAddExpr->getOperand(1);
auto *MinusOne = getNegativeSCEV(getOne(RHS->getType()));
// Checks that S1 >= 0 && S2 > RHS, trivially or using the found context.
auto IsSumGreaterThanRHS = [&](const SCEV *S1, const SCEV *S2) {
return IsSGTViaContext(S1, MinusOne) && IsSGTViaContext(S2, RHS);
};
// Try to prove the following rule:
// (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS).
// (LHS = LL + LR) && (LR >= 0) && (LL > RHS) => (LHS > RHS).
if (IsSumGreaterThanRHS(LL, LR) || IsSumGreaterThanRHS(LR, LL))
return true;
} else if (auto *LHSUnknownExpr = dyn_cast<SCEVUnknown>(LHS)) {
Value *LL, *LR;
// FIXME: Once we have SDiv implemented, we can get rid of this matching.
using namespace llvm::PatternMatch;
if (match(LHSUnknownExpr->getValue(), m_SDiv(m_Value(LL), m_Value(LR)))) {
// Rules for division.
// We are going to perform some comparisons with Denominator and its
// derivative expressions. In the general case, creating a SCEV for it may
// lead to a complex analysis of the entire graph, and in particular it
// can request trip count recalculation for the same loop. That result would
// be cached as SCEVCouldNotCompute to avoid the infinite recursion. To avoid
// this, we only want to create SCEVs that are constants in this section.
// So we bail out if Denominator is not a constant.
if (!isa<ConstantInt>(LR))
return false;
auto *Denominator = cast<SCEVConstant>(getSCEV(LR));
// We want to make sure that LHS = FoundLHS / Denominator. If it is so,
// then a SCEV for the numerator already exists and matches with FoundLHS.
auto *Numerator = getExistingSCEV(LL);
if (!Numerator || Numerator->getType() != FoundLHS->getType())
return false;
// Make sure that the numerator matches with FoundLHS and the denominator
// is positive.
if (!HasSameValue(Numerator, FoundLHS) || !isKnownPositive(Denominator))
return false;
auto *DTy = Denominator->getType();
auto *FRHSTy = FoundRHS->getType();
if (DTy->isPointerTy() != FRHSTy->isPointerTy())
// One of the types is a pointer and the other is not. We cannot extend
// them properly to a wider type, so let us just reject this case.
// TODO: Usage of getEffectiveSCEVType for DTy, FRHSTy etc should help
// to avoid this check.
return false;
// Given that:
// FoundLHS > FoundRHS, LHS = FoundLHS / Denominator, Denominator > 0.
auto *WTy = getWiderType(DTy, FRHSTy);
auto *DenominatorExt = getNoopOrSignExtend(Denominator, WTy);
auto *FoundRHSExt = getNoopOrSignExtend(FoundRHS, WTy);
// Try to prove the following rule:
// (FoundRHS > Denominator - 2) && (RHS <= 0) => (LHS > RHS).
// For example, given that FoundLHS > 2, FoundLHS is at least 3. If we
// divide it by Denominator < 4, we will have at least 1.
auto *DenomMinusTwo = getMinusSCEV(DenominatorExt, getConstant(WTy, 2));
if (isKnownNonPositive(RHS) &&
IsSGTViaContext(FoundRHSExt, DenomMinusTwo))
return true;
// Try to prove the following rule:
// (FoundRHS > -1 - Denominator) && (RHS < 0) => (LHS > RHS).
// For example, given that FoundLHS > -3, FoundLHS is at least -2.
// If we divide it by Denominator > 2, then:
// 1. If FoundLHS is negative, then the result is 0.
// 2. If FoundLHS is non-negative, then the result is non-negative.
// Either way, the result is non-negative.
auto *MinusOne = getNegativeSCEV(getOne(WTy));
auto *NegDenomMinusOne = getMinusSCEV(MinusOne, DenominatorExt);
if (isKnownNegative(RHS) &&
IsSGTViaContext(FoundRHSExt, NegDenomMinusOne))
return true;
}
}
return false;
}
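// Illustrative sketch, not part of the original file: the arithmetic behind
// the first division rule above, restated with plain signed integers and a
// helper name that is an assumption of this sketch. If Denominator > 0,
// FoundLHS > FoundRHS and FoundRHS > Denominator - 2, then
// FoundLHS >= Denominator, so FoundLHS / Denominator >= 1 > 0 >= RHS.
static bool divisionRuleHolds(long long FoundLHS, long long FoundRHS,
                              long long Denominator, long long RHS) {
  bool Preconditions = Denominator > 0 && FoundLHS > FoundRHS &&
                       FoundRHS > Denominator - 2 && RHS <= 0;
  if (!Preconditions)
    return true; // the rule only claims something when its premises hold
  return FoundLHS / Denominator > RHS;
}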
bool
ScalarEvolution::isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
}
bool
ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
switch (Pred) {
default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_NE:
if (HasSameValue(LHS, FoundLHS) && HasSameValue(RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
if (isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
if (isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
return true;
break;
}
// Maybe it can be proved via operations?
if (isImpliedViaOperations(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
return false;
}
bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
if (!isa<SCEVConstant>(RHS) || !isa<SCEVConstant>(FoundRHS))
// The restriction on `FoundRHS` can be lifted easily -- it exists only to
// reduce the compile time impact of this optimization.
return false;
Optional<APInt> Addend = computeConstantDifference(LHS, FoundLHS);
if (!Addend)
return false;
const APInt &ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getAPInt();
// `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the
// antecedent "`FoundLHS` `Pred` `FoundRHS`".
ConstantRange FoundLHSRange =
ConstantRange::makeAllowedICmpRegion(Pred, ConstFoundRHS);
// Since `LHS` is `FoundLHS` + `Addend`, we can compute a range for `LHS`:
ConstantRange LHSRange = FoundLHSRange.add(ConstantRange(*Addend));
// We can also compute the range of values for `LHS` that satisfy the
// consequent, "`LHS` `Pred` `RHS`":
const APInt &ConstRHS = cast<SCEVConstant>(RHS)->getAPInt();
ConstantRange SatisfyingLHSRange =
ConstantRange::makeSatisfyingICmpRegion(Pred, ConstRHS);
// The antecedent implies the consequent if every value of `LHS` that
// satisfies the antecedent also satisfies the consequent.
return SatisfyingLHSRange.contains(LHSRange);
}
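// Illustrative sketch, not part of the original file: the same range argument
// with plain half-open unsigned intervals instead of ConstantRange, ignoring
// wraparound (the helper name is an assumption of this sketch). The antecedent
// "FoundLHS u< FoundRHS" places FoundLHS in [0, FoundRHS), so
// LHS = FoundLHS + Addend lies in [Addend, FoundRHS + Addend); the consequent
// "LHS u< RHS" holds for the whole interval iff its upper end stays below RHS.
static bool impliedViaRangesSketch(unsigned FoundRHS, unsigned Addend,
                                   unsigned RHS) {
  // Containment of [Addend, FoundRHS + Addend) within [0, RHS).
  return FoundRHS + Addend <= RHS;
}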
bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
bool IsSigned, bool NoWrap) {
assert(isKnownPositive(Stride) && "Positive stride expected!");
if (NoWrap) return false;
unsigned BitWidth = getTypeSizeInBits(RHS->getType());
const SCEV *One = getOne(Stride->getType());
if (IsSigned) {
APInt MaxRHS = getSignedRangeMax(RHS);
APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One));
// SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS);
}
APInt MaxRHS = getUnsignedRangeMax(RHS);
APInt MaxValue = APInt::getMaxValue(BitWidth);
APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One));
// UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS);
}
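// Illustrative sketch, not part of the original file: the unsigned branch of
// the test above for an 8-bit IV, with inputs assumed to already fit in 8 bits
// and a helper name that is an assumption of this sketch. The IV may have to
// reach up to RHS + (Stride - 1) before the "<" exit fires; if that exceeds the
// largest 8-bit value, the IV wraps. The subtraction form avoids forming the
// possibly-wrapping sum.
static bool wouldOverflowOnULT8Bit(unsigned MaxRHS, unsigned MaxStrideMinusOne) {
  const unsigned MaxValue = 255; // largest 8-bit unsigned value
  return (MaxValue - MaxStrideMinusOne) < MaxRHS; // MaxRHS + MaxStrideMinusOne > MaxValue
}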
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
bool IsSigned, bool NoWrap) {
if (NoWrap) return false;
unsigned BitWidth = getTypeSizeInBits(RHS->getType());
const SCEV *One = getOne(Stride->getType());
if (IsSigned) {
APInt MinRHS = getSignedRangeMin(RHS);
APInt MinValue = APInt::getSignedMinValue(BitWidth);
APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One));
// SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS);
}
APInt MinRHS = getUnsignedRangeMin(RHS);
APInt MinValue = APInt::getMinValue(BitWidth);
APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One));
// UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
}
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
bool Equality) {
const SCEV *One = getOne(Step->getType());
Delta = Equality ? getAddExpr(Delta, Step)
: getAddExpr(Delta, getMinusSCEV(Step, One));
return getUDivExpr(Delta, Step);
}
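// Illustrative sketch, not part of the original file: with Equality == false
// the expression built above is the unsigned ceiling division
// ceil(Delta / Step) == (Delta + Step - 1) / Step; with Equality == true a full
// extra Step is added because the exit test is "<=" rather than "<". The helper
// name and plain integer types are assumptions of this sketch.
static unsigned long long ceilingDivSketch(unsigned long long Delta,
                                           unsigned long long Step) {
  return (Delta + Step - 1) / Step; // assumes Step != 0 and the sum does not wrap
}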
ScalarEvolution::ExitLimit
ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
const Loop *L, bool IsSigned,
bool ControlsExit, bool AllowPredicates) {
SmallPtrSet<const SCEVPredicate *, 4> Predicates;
// We handle only IV < Invariant
if (!isLoopInvariant(RHS, L))
return getCouldNotCompute();
const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
bool PredicatedIV = false;
if (!IV && AllowPredicates) {
// Try to make this an AddRec using runtime tests, in the first X
// iterations of this loop, where X is the SCEV expression found by the
// algorithm below.
IV = convertSCEVToAddRecWithPredicates(LHS, L, Predicates);
PredicatedIV = true;
}
// Avoid weird loops
if (!IV || IV->getLoop() != L || !IV->isAffine())
return getCouldNotCompute();
bool NoWrap = ControlsExit &&
IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
const SCEV *Stride = IV->getStepRecurrence(*this);
bool PositiveStride = isKnownPositive(Stride);
// Avoid negative or zero stride values.
if (!PositiveStride) {
// We can compute the correct backedge taken count for loops with unknown
// strides if we can prove that the loop is not an infinite loop with side
// effects. Here's the loop structure we are trying to handle -
//
// i = start
// do {
// A[i] = i;
// i += s;
// } while (i < end);
//
// The backedge taken count for such loops is evaluated as -
// (max(end, start + stride) - start - 1) /u stride
//
// The additional preconditions that we need to check to prove correctness
// of the above formula are as follows -
//
// a) IV is either nuw or nsw depending upon signedness (indicated by the
// NoWrap flag).
// b) loop is single exit with no side effects.
//
//
// Precondition a) implies that if the stride is negative, this is a single
// trip loop. The backedge taken count formula reduces to zero in this case.
//
// Precondition b) implies that the unknown stride cannot be zero otherwise
// we have UB.
//
// The positive stride case is the same as isKnownPositive(Stride) returning
// true (original behavior of the function).
//
// We want to make sure that the stride is truly unknown as there are edge
// cases where ScalarEvolution propagates no wrap flags to the
// post-increment/decrement IV even though the increment/decrement operation
// itself is wrapping. The computed backedge taken count may be wrong in
// such cases. This is prevented by checking that the stride is not known to
// be either positive or non-positive. For example, no wrap flags are
// propagated to the post-increment IV of this loop with a trip count of 2 -
//
// unsigned char i;
// for(i=127; i<128; i+=129)
// A[i] = i;
//
if (PredicatedIV || !NoWrap || isKnownNonPositive(Stride) ||
!loopHasNoSideEffects(L))
return getCouldNotCompute();
} else if (!Stride->isOne() &&
doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
// Avoid proven overflow cases: this will ensure that the backedge taken
// count will not generate any unsigned overflow. Relaxed no-overflow
// conditions exploit NoWrapFlags, allowing optimization in the presence of
// undefined behavior, as in the C language.
return getCouldNotCompute();
ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT
: ICmpInst::ICMP_ULT;
const SCEV *Start = IV->getStart();
const SCEV *End = RHS;
// If the backedge is taken at least once, then it will be taken
// (End-Start)/Stride times (rounded up to a multiple of Stride), where Start
// is the LHS value of the less-than comparison the first time it is evaluated
// and End is the RHS.
const SCEV *BECountIfBackedgeTaken =
computeBECount(getMinusSCEV(End, Start), Stride, false);
// If the loop entry is guarded by the result of the backedge test of the
// first loop iteration, then we know the backedge will be taken at least
// once and so the backedge taken count is as above. If not, then we use the
// expression (max(End,Start)-Start)/Stride to describe the backedge count:
// if the backedge is taken at least once, max(End,Start) is End and the
// result is as above; otherwise max(End,Start) is Start and we get a backedge
// count of zero.
const SCEV *BECount;
if (isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
BECount = BECountIfBackedgeTaken;
else {
End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start);
BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
}
const SCEV *MaxBECount;
bool MaxOrZero = false;
if (isa<SCEVConstant>(BECount))
MaxBECount = BECount;
else if (isa<SCEVConstant>(BECountIfBackedgeTaken)) {
// If we know exactly how many times the backedge will be taken if it's
// taken at least once, then the backedge count will either be that or
// zero.
MaxBECount = BECountIfBackedgeTaken;
MaxOrZero = true;
} else {
// Calculate the maximum backedge count based on the range of values
// permitted by Start, End, and Stride.
APInt MinStart = IsSigned ? getSignedRangeMin(Start)
: getUnsignedRangeMin(Start);
unsigned BitWidth = getTypeSizeInBits(LHS->getType());
APInt StrideForMaxBECount;
if (PositiveStride)
StrideForMaxBECount =
IsSigned ? getSignedRangeMin(Stride)
: getUnsignedRangeMin(Stride);
else
// Using a stride of 1 is safe when computing max backedge taken count for
// a loop with unknown stride.
StrideForMaxBECount = APInt(BitWidth, 1, IsSigned);
APInt Limit =
IsSigned ? APInt::getSignedMaxValue(BitWidth) - (StrideForMaxBECount - 1)
: APInt::getMaxValue(BitWidth) - (StrideForMaxBECount - 1);
// Although End can be a MAX expression, we estimate MaxEnd considering only
// the case End = RHS. This is safe because in the other case (End - Start)
// is zero, leading to a zero maximum backedge taken count.
APInt MaxEnd =
IsSigned ? APIntOps::smin(getSignedRangeMax(RHS), Limit)
: APIntOps::umin(getUnsignedRangeMax(RHS), Limit);
MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
getConstant(StrideForMaxBECount), false);
}
if (isa<SCEVCouldNotCompute>(MaxBECount) &&
!isa<SCEVCouldNotCompute>(BECount))
MaxBECount = getConstant(getUnsignedRangeMax(BECount));
return ExitLimit(BECount, MaxBECount, MaxOrZero, Predicates);
}
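// Illustrative sketch, not part of the original file: the unguarded-entry
// formula above, (umax(End, Start) - Start + Stride - 1) / Stride, evaluated
// with plain integers for a loop whose header tests IV < End with
// IV = {Start,+,Stride}. For Start = 0, End = 10, Stride = 3 the IV takes the
// values 0, 3, 6, 9, 12 at the test, so the backedge is taken 4 times, matching
// (10 - 0 + 2) / 3 == 4. The helper name is an assumption of this sketch.
static unsigned long long lessThanBackedgeCountSketch(unsigned long long Start,
                                                      unsigned long long End,
                                                      unsigned long long Stride) {
  unsigned long long ClampedEnd = End > Start ? End : Start; // umax(End, Start)
  return (ClampedEnd - Start + Stride - 1) / Stride; // assumes no overflow, Stride != 0
}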
ScalarEvolution::ExitLimit
ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
const Loop *L, bool IsSigned,
bool ControlsExit, bool AllowPredicates) {
SmallPtrSet<const SCEVPredicate *, 4> Predicates;
// We handle only IV > Invariant
if (!isLoopInvariant(RHS, L))
return getCouldNotCompute();
const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
if (!IV && AllowPredicates)
// Try to make this an AddRec using runtime tests, in the first X
// iterations of this loop, where X is the SCEV expression found by the
// algorithm below.
IV = convertSCEVToAddRecWithPredicates(LHS, L, Predicates);
// Avoid weird loops
if (!IV || IV->getLoop() != L || !IV->isAffine())
return getCouldNotCompute();
bool NoWrap = ControlsExit &&
IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this));
// Avoid negative or zero stride values
if (!isKnownPositive(Stride))
return getCouldNotCompute();
// Avoid proven overflow cases: this will ensure that the backedge taken count
// will not generate any unsigned overflow. Relaxed no-overflow conditions
// exploit NoWrapFlags, allowing optimization in the presence of undefined
// behavior, as in the C language.
if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap))
return getCouldNotCompute();
ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT
: ICmpInst::ICMP_UGT;
const SCEV *Start = IV->getStart();
const SCEV *End = RHS;
if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
End = IsSigned ? getSMinExpr(RHS, Start) : getUMinExpr(RHS, Start);
const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
APInt MaxStart = IsSigned ? getSignedRangeMax(Start)
: getUnsignedRangeMax(Start);
APInt MinStride = IsSigned ? getSignedRangeMin(Stride)
: getUnsignedRangeMin(Stride);
unsigned BitWidth = getTypeSizeInBits(LHS->getType());
APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1)
: APInt::getMinValue(BitWidth) + (MinStride - 1);
// Although End can be a MIN expression, we estimate MinEnd considering only
// the case End = RHS. This is safe because in the other case (Start - End)
// is zero, leading to a zero maximum backedge taken count.
APInt MinEnd =
IsSigned ? APIntOps::smax(getSignedRangeMin(RHS), Limit)
: APIntOps::umax(getUnsignedRangeMin(RHS), Limit);
const SCEV *MaxBECount = getCouldNotCompute();
if (isa<SCEVConstant>(BECount))
MaxBECount = BECount;
else
MaxBECount = computeBECount(getConstant(MaxStart - MinEnd),
getConstant(MinStride), false);
if (isa<SCEVCouldNotCompute>(MaxBECount))
MaxBECount = BECount;
return ExitLimit(BECount, MaxBECount, false, Predicates);
}
const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range,
ScalarEvolution &SE) const {
if (Range.isFullSet()) // Infinite loop.
return SE.getCouldNotCompute();
// If the start is a non-zero constant, shift the range to simplify things.
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart()))
if (!SC->getValue()->isZero()) {
SmallVector<const SCEV *, 4> Operands(op_begin(), op_end());
Operands[0] = SE.getZero(SC->getType());
const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(),
getNoWrapFlags(FlagNW));
if (const auto *ShiftedAddRec = dyn_cast<SCEVAddRecExpr>(Shifted))
return ShiftedAddRec->getNumIterationsInRange(
Range.subtract(SC->getAPInt()), SE);
// This is strange and shouldn't happen.
return SE.getCouldNotCompute();
}
// The only time we can solve this is when we have all constant indices.
// Otherwise, we cannot determine the overflow conditions.
if (any_of(operands(), [](const SCEV *Op) { return !isa<SCEVConstant>(Op); }))
return SE.getCouldNotCompute();
// Okay at this point we know that all elements of the chrec are constants and
// that the start element is zero.
// First check to see if the range contains zero. If not, the first
// iteration exits.
unsigned BitWidth = SE.getTypeSizeInBits(getType());
if (!Range.contains(APInt(BitWidth, 0)))
return SE.getZero(getType());
if (isAffine()) {
// If this is an affine expression then we have this situation:
// Solve {0,+,A} in Range === Ax in Range
// We know that zero is in the range. If A is positive then we know that
// the upper value of the range must be the first possible exit value.
// If A is negative then the lower of the range is the last possible loop
// value. Also note that we already checked for a full range.
APInt A = cast<SCEVConstant>(getOperand(1))->getAPInt();
APInt End = A.sge(1) ? (Range.getUpper() - 1) : Range.getLower();
// The exit value should be (End+A)/A.
APInt ExitVal = (End + A).udiv(A);
ConstantInt *ExitValue = ConstantInt::get(SE.getContext(), ExitVal);
// Evaluate at the exit value. If we really did fall out of the valid
// range, then we computed our trip count, otherwise wrap around or other
// things must have happened.
ConstantInt *Val = EvaluateConstantChrecAtConstant(this, ExitValue, SE);
if (Range.contains(Val->getValue()))
return SE.getCouldNotCompute(); // Something strange happened
// Ensure that the previous value is in the range. This is a sanity check.
assert(Range.contains(
EvaluateConstantChrecAtConstant(this,
ConstantInt::get(SE.getContext(), ExitVal - 1), SE)->getValue()) &&
"Linear scev computation is off in a bad way!");
return SE.getConstant(ExitValue);
} else if (isQuadratic()) {
// If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of the
// quadratic equation to solve it. To do this, we must frame our problem in
// terms of figuring out when zero is crossed, instead of when
// Range.getUpper() is crossed.
SmallVector<const SCEV *, 4> NewOps(op_begin(), op_end());
NewOps[0] = SE.getNegativeSCEV(SE.getConstant(Range.getUpper()));
const SCEV *NewAddRec = SE.getAddRecExpr(NewOps, getLoop(), FlagAnyWrap);
// Next, solve the constructed addrec
if (auto Roots =
SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE)) {
const SCEVConstant *R1 = Roots->first;
const SCEVConstant *R2 = Roots->second;
// Pick the smallest positive root value.
if (ConstantInt *CB = dyn_cast<ConstantInt>(ConstantExpr::getICmp(
ICmpInst::ICMP_ULT, R1->getValue(), R2->getValue()))) {
if (!CB->getZExtValue())
std::swap(R1, R2); // R1 is the minimum root now.
// Make sure the root is not off by one. The returned iteration should
// not be in the range, but the previous one should be. When solving
// for "X*X < 5", for example, we should not return a root of 2.
ConstantInt *R1Val =
EvaluateConstantChrecAtConstant(this, R1->getValue(), SE);
if (Range.contains(R1Val->getValue())) {
// The next iteration must be out of the range...
ConstantInt *NextVal =
ConstantInt::get(SE.getContext(), R1->getAPInt() + 1);
R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
if (!Range.contains(R1Val->getValue()))
return SE.getConstant(NextVal);
return SE.getCouldNotCompute(); // Something strange happened
}
// If R1 was not in the range, then it is a good return value. Make
// sure that R1-1 WAS in the range though, just in case.
ConstantInt *NextVal =
ConstantInt::get(SE.getContext(), R1->getAPInt() - 1);
R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
if (Range.contains(R1Val->getValue()))
return R1;
return SE.getCouldNotCompute(); // Something strange happened
}
}
}
return SE.getCouldNotCompute();
}
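// Illustrative sketch, not part of the original file: the affine case above
// for a positive step A and a range [0, Upper). {0,+,A} stays inside the range
// for iterations 0 .. ceil(Upper/A) - 1, so the first out-of-range iteration is
// ceil(Upper/A) == ((Upper - 1) + A) / A, i.e. (End + A) / A with
// End = Upper - 1 as computed above. The helper name is an assumption of this
// sketch; it assumes A >= 1, Upper >= 1, and no overflow.
static unsigned firstIterationOutsideRange(unsigned Upper, unsigned A) {
  return (Upper - 1 + A) / A;
}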
// Return true when S contains at least one undef value.
static inline bool containsUndefs(const SCEV *S) {
return SCEVExprContains(S, [](const SCEV *S) {
if (const auto *SU = dyn_cast<SCEVUnknown>(S))
return isa<UndefValue>(SU->getValue());
else if (const auto *SC = dyn_cast<SCEVConstant>(S))
return isa<UndefValue>(SC->getValue());
return false;
});
}
namespace {
// Collect all steps of SCEV expressions.
struct SCEVCollectStrides {
ScalarEvolution &SE;
SmallVectorImpl<const SCEV *> &Strides;
SCEVCollectStrides(ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &S)
: SE(SE), Strides(S) {}
bool follow(const SCEV *S) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
Strides.push_back(AR->getStepRecurrence(SE));
return true;
}
bool isDone() const { return false; }
};
// Collect all SCEVUnknown and SCEVMulExpr expressions.
struct SCEVCollectTerms {
SmallVectorImpl<const SCEV *> &Terms;
SCEVCollectTerms(SmallVectorImpl<const SCEV *> &T)
: Terms(T) {}
bool follow(const SCEV *S) {
if (isa<SCEVUnknown>(S) || isa<SCEVMulExpr>(S) ||
isa<SCEVSignExtendExpr>(S)) {
if (!containsUndefs(S))
Terms.push_back(S);
// Stop recursion: once we collected a term, do not walk its operands.
return false;
}
// Keep looking.
return true;
}
bool isDone() const { return false; }
};
// Check if a SCEV contains an AddRecExpr.
struct SCEVHasAddRec {
bool &ContainsAddRec;
SCEVHasAddRec(bool &ContainsAddRec) : ContainsAddRec(ContainsAddRec) {
ContainsAddRec = false;
}
bool follow(const SCEV *S) {
if (isa<SCEVAddRecExpr>(S)) {
ContainsAddRec = true;
// Stop recursion: once we collected a term, do not walk its operands.
return false;
}
// Keep looking.
return true;
}
bool isDone() const { return false; }
};
// Find factors that are multiplied with an expression that (possibly as a
// subexpression) contains an AddRecExpr. In the expression:
//
// 8 * (100 + %p * %q * (%a + {0, +, 1}_loop))
//
// "%p * %q" are factors multiplied by the expression "(%a + {0, +, 1}_loop)"
// that contains the AddRec {0, +, 1}_loop. %p * %q are likely to be array size
// parameters as they form a product with an induction variable.
//
// This collector expects all array size parameters to be in the same MulExpr.
// It might be necessary to later add support for collecting parameters that are
// spread over different nested MulExpr.
struct SCEVCollectAddRecMultiplies {
SmallVectorImpl<const SCEV *> &Terms;
ScalarEvolution &SE;
SCEVCollectAddRecMultiplies(SmallVectorImpl<const SCEV *> &T, ScalarEvolution &SE)
: Terms(T), SE(SE) {}
bool follow(const SCEV *S) {
if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
bool HasAddRec = false;
SmallVector<const SCEV *, 0> Operands;
for (auto Op : Mul->operands()) {
const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Op);
if (Unknown && !isa<CallInst>(Unknown->getValue())) {
Operands.push_back(Op);
} else if (Unknown) {
HasAddRec = true;
} else {
bool ContainsAddRec;
SCEVHasAddRec HasAddRecVisitor(ContainsAddRec);
visitAll(Op, HasAddRecVisitor);
HasAddRec |= ContainsAddRec;
}
}
if (Operands.size() == 0)
return true;
if (!HasAddRec)
return false;
Terms.push_back(SE.getMulExpr(Operands));
// Stop recursion: once we collected a term, do not walk its operands.
return false;
}
// Keep looking.
return true;
}
bool isDone() const { return false; }
};
}
/// Find parametric terms in this SCEVAddRecExpr. We look for parameters in
/// two places:
/// 1) The strides of AddRec expressions.
/// 2) Unknowns that are multiplied with AddRec expressions.
void ScalarEvolution::collectParametricTerms(const SCEV *Expr,
SmallVectorImpl<const SCEV *> &Terms) {
SmallVector<const SCEV *, 4> Strides;
SCEVCollectStrides StrideCollector(*this, Strides);
visitAll(Expr, StrideCollector);
DEBUG({
dbgs() << "Strides:\n";
for (const SCEV *S : Strides)
dbgs() << *S << "\n";
});
for (const SCEV *S : Strides) {
SCEVCollectTerms TermCollector(Terms);
visitAll(S, TermCollector);
}
DEBUG({
dbgs() << "Terms:\n";
for (const SCEV *T : Terms)
dbgs() << *T << "\n";
});
SCEVCollectAddRecMultiplies MulCollector(Terms, *this);
visitAll(Expr, MulCollector);
}
static bool findArrayDimensionsRec(ScalarEvolution &SE,
SmallVectorImpl<const SCEV *> &Terms,
SmallVectorImpl<const SCEV *> &Sizes) {
int Last = Terms.size() - 1;
const SCEV *Step = Terms[Last];
// End of recursion.
if (Last == 0) {
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Step)) {
SmallVector<const SCEV *, 2> Qs;
for (const SCEV *Op : M->operands())
if (!isa<SCEVConstant>(Op))
Qs.push_back(Op);
Step = SE.getMulExpr(Qs);
}
Sizes.push_back(Step);
return true;
}
for (const SCEV *&Term : Terms) {
// Normalize the terms before the next call to findArrayDimensionsRec.
const SCEV *Q, *R;
SCEVDivision::divide(SE, Term, Step, &Q, &R);
// Bail out when GCD does not evenly divide one of the terms.
if (!R->isZero())
return false;
Term = Q;
}
// Remove all SCEVConstants.
Terms.erase(
remove_if(Terms, [](const SCEV *E) { return isa<SCEVConstant>(E); }),
Terms.end());
if (Terms.size() > 0)
if (!findArrayDimensionsRec(SE, Terms, Sizes))
return false;
Sizes.push_back(Step);
return true;
}
// Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter.
static inline bool containsParameters(SmallVectorImpl<const SCEV *> &Terms) {
for (const SCEV *T : Terms)
if (SCEVExprContains(T, isa<SCEVUnknown, const SCEV *>))
return true;
return false;
}
// Return the number of product terms in S.
static inline int numberOfTerms(const SCEV *S) {
if (const SCEVMulExpr *Expr = dyn_cast<SCEVMulExpr>(S))
return Expr->getNumOperands();
return 1;
}
static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) {
if (isa<SCEVConstant>(T))
return nullptr;
if (isa<SCEVUnknown>(T))
return T;
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(T)) {
SmallVector<const SCEV *, 2> Factors;
for (const SCEV *Op : M->operands())
if (!isa<SCEVConstant>(Op))
Factors.push_back(Op);
return SE.getMulExpr(Factors);
}
return T;
}
/// Return the size of an element read or written by Inst.
const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
Type *Ty;
if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
Ty = Store->getValueOperand()->getType();
else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
Ty = Load->getType();
else
return nullptr;
Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty));
return getSizeOfExpr(ETy, Ty);
}
void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
SmallVectorImpl<const SCEV *> &Sizes,
const SCEV *ElementSize) {
if (Terms.size() < 1 || !ElementSize)
return;
// Early return when Terms do not contain parameters: we do not delinearize
// non-parametric SCEVs.
if (!containsParameters(Terms))
return;
DEBUG({
dbgs() << "Terms:\n";
for (const SCEV *T : Terms)
dbgs() << *T << "\n";
});
// Remove duplicates.
array_pod_sort(Terms.begin(), Terms.end());
Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end());
// Put larger terms first.
std::sort(Terms.begin(), Terms.end(), [](const SCEV *LHS, const SCEV *RHS) {
return numberOfTerms(LHS) > numberOfTerms(RHS);
});
// Try to divide all terms by the element size. If term is not divisible by
// element size, proceed with the original term.
for (const SCEV *&Term : Terms) {
const SCEV *Q, *R;
SCEVDivision::divide(*this, Term, ElementSize, &Q, &R);
if (!Q->isZero())
Term = Q;
}
SmallVector<const SCEV *, 4> NewTerms;
// Remove constant factors.
for (const SCEV *T : Terms)
if (const SCEV *NewT = removeConstantFactors(*this, T))
NewTerms.push_back(NewT);
DEBUG({
dbgs() << "Terms after sorting:\n";
for (const SCEV *T : NewTerms)
dbgs() << *T << "\n";
});
if (NewTerms.empty() || !findArrayDimensionsRec(*this, NewTerms, Sizes)) {
Sizes.clear();
return;
}
// The last element to be pushed into Sizes is the size of an element.
Sizes.push_back(ElementSize);
DEBUG({
dbgs() << "Sizes:\n";
for (const SCEV *S : Sizes)
dbgs() << *S << "\n";
});
}
void ScalarEvolution::computeAccessFunctions(
const SCEV *Expr, SmallVectorImpl<const SCEV *> &Subscripts,
SmallVectorImpl<const SCEV *> &Sizes) {
// Early exit in case this SCEV is not an affine multivariate function.
if (Sizes.empty())
return;
if (auto *AR = dyn_cast<SCEVAddRecExpr>(Expr))
if (!AR->isAffine())
return;
const SCEV *Res = Expr;
int Last = Sizes.size() - 1;
for (int i = Last; i >= 0; i--) {
const SCEV *Q, *R;
SCEVDivision::divide(*this, Res, Sizes[i], &Q, &R);
DEBUG({
dbgs() << "Res: " << *Res << "\n";
dbgs() << "Sizes[i]: " << *Sizes[i] << "\n";
dbgs() << "Res divided by Sizes[i]:\n";
dbgs() << "Quotient: " << *Q << "\n";
dbgs() << "Remainder: " << *R << "\n";
});
Res = Q;
// Do not record the last subscript corresponding to the size of elements in
// the array.
if (i == Last) {
// Bail out if the remainder is too complex.
if (isa<SCEVAddRecExpr>(R)) {
Subscripts.clear();
Sizes.clear();
return;
}
continue;
}
// Record the access function for the current subscript.
Subscripts.push_back(R);
}
// Also push in last position the remainder of the last division: it will be
// the access function of the innermost dimension.
Subscripts.push_back(Res);
std::reverse(Subscripts.begin(), Subscripts.end());
DEBUG({
dbgs() << "Subscripts:\n";
for (const SCEV *S : Subscripts)
dbgs() << *S << "\n";
});
}
/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
/// sizes of an array access. Returns the remainder of the delinearization that
/// is the offset start of the array. The SCEV->delinearize algorithm computes
/// the multiples of SCEV coefficients: that is a pattern matching of sub
/// expressions in the stride and base of a SCEV corresponding to the
/// computation of a GCD (greatest common divisor) of base and stride. When
/// SCEV->delinearize fails, it returns the SCEV unchanged.
///
/// For example: when analyzing the memory access A[i][j][k] in this loop nest
///
/// void foo(long n, long m, long o, double A[n][m][o]) {
///
/// for (long i = 0; i < n; i++)
/// for (long j = 0; j < m; j++)
/// for (long k = 0; k < o; k++)
/// A[i][j][k] = 1.0;
/// }
///
/// the delinearization input is the following AddRec SCEV:
///
/// AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
///
/// From this SCEV, we are able to say that the base offset of the access is %A
/// because it appears as an offset that does not divide any of the strides in
/// the loops:
///
/// CHECK: Base offset: %A
///
/// and then SCEV->delinearize determines the size of some of the dimensions of
/// the array as these are the multiples by which the strides are happening:
///
/// CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
///
/// Note that the outermost dimension remains of UnknownSize because there are
/// no strides that would help identify the size of the last dimension: when
/// the array has been statically allocated, one could compute the size of that
/// dimension by dividing the overall size of the array by the size of the known
/// dimensions: %m * %o * 8.
///
/// Finally delinearize provides the access functions for the array reference
/// that does correspond to A[i][j][k] of the above C testcase:
///
/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
///
/// The testcases are checking the output of a function pass:
/// DelinearizationPass that walks through all loads and stores of a function
/// asking for the SCEV of the memory access with respect to all enclosing
/// loops, calling SCEV->delinearize on that and printing the results.
void ScalarEvolution::delinearize(const SCEV *Expr,
SmallVectorImpl<const SCEV *> &Subscripts,
SmallVectorImpl<const SCEV *> &Sizes,
const SCEV *ElementSize) {
// First step: collect parametric terms.
SmallVector<const SCEV *, 4> Terms;
collectParametricTerms(Expr, Terms);
if (Terms.empty())
return;
// Second step: find subscript sizes.
findArrayDimensions(Terms, Sizes, ElementSize);
if (Sizes.empty())
return;
// Third step: compute the access functions for each subscript.
computeAccessFunctions(Expr, Subscripts, Sizes);
if (Subscripts.empty())
return;
DEBUG({
dbgs() << "succeeded to delinearize " << *Expr << "\n";
dbgs() << "ArrayDecl[UnknownSize]";
for (const SCEV *S : Sizes)
dbgs() << "[" << *S << "]";
dbgs() << "\nArrayRef";
for (const SCEV *S : Subscripts)
dbgs() << "[" << *S << "]";
dbgs() << "\n";
});
}
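// Illustrative sketch, not part of the original file: the arithmetic performed
// by computeAccessFunctions for the A[i][j][k] example in the comment above,
// restated with plain integers. The flattened offset of A[i][j][k] (relative
// to %A) is ((i * m + j) * o + k) * ElemSize, and dividing by the recovered
// sizes from innermost to outermost peels off k, then j, leaving i. The helper
// name and reference parameters are assumptions of this sketch.
static void recoverSubscriptsSketch(unsigned long long Offset,
                                    unsigned long long m, unsigned long long o,
                                    unsigned long long ElemSize,
                                    unsigned long long &I, unsigned long long &J,
                                    unsigned long long &K) {
  Offset /= ElemSize;   // strip the element size; remainder is 0 for an exact access
  K = Offset % o;       // innermost subscript
  Offset /= o;
  J = Offset % m;       // middle subscript
  Offset /= m;
  I = Offset;           // outermost subscript (dimension of unknown size)
}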
//===----------------------------------------------------------------------===//
// SCEVCallbackVH Class Implementation
//===----------------------------------------------------------------------===//
void ScalarEvolution::SCEVCallbackVH::deleted() {
assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
SE->ConstantEvolutionLoopExitValue.erase(PN);
SE->eraseValueFromMap(getValPtr());
// this now dangles!
}
void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) {
assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
// Forget all the expressions associated with users of the old value,
// so that future queries will recompute the expressions using the new
// value.
Value *Old = getValPtr();
SmallVector<User *, 16> Worklist(Old->user_begin(), Old->user_end());
SmallPtrSet<User *, 8> Visited;
while (!Worklist.empty()) {
User *U = Worklist.pop_back_val();
// Deleting the Old value will cause this to dangle. Postpone
// that until everything else is done.
if (U == Old)
continue;
if (!Visited.insert(U).second)
continue;
if (PHINode *PN = dyn_cast<PHINode>(U))
SE->ConstantEvolutionLoopExitValue.erase(PN);
SE->eraseValueFromMap(U);
Worklist.insert(Worklist.end(), U->user_begin(), U->user_end());
}
// Delete the Old value.
if (PHINode *PN = dyn_cast<PHINode>(Old))
SE->ConstantEvolutionLoopExitValue.erase(PN);
SE->eraseValueFromMap(Old);
// this now dangles!
}
ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
: CallbackVH(V), SE(se) {}
//===----------------------------------------------------------------------===//
// ScalarEvolution Class Implementation
//===----------------------------------------------------------------------===//
ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI,
AssumptionCache &AC, DominatorTree &DT,
LoopInfo &LI)
: F(F), TLI(TLI), AC(AC), DT(DT), LI(LI),
CouldNotCompute(new SCEVCouldNotCompute()),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64),
FirstUnknown(nullptr) {
// To use guards for proving predicates, we need to scan every instruction in
// relevant basic blocks, and not just terminators. Doing this is a waste of
// time if the IR does not actually contain any calls to
// @llvm.experimental.guard, so do a quick check and remember this beforehand.
//
// This pessimizes the case where a pass that preserves ScalarEvolution wants
// to _add_ guards to the module when there weren't any before, and wants
// ScalarEvolution to optimize based on those guards. For now we prefer to be
// efficient instead of smart in that rather obscure case.
auto *GuardDecl = F.getParent()->getFunction(
Intrinsic::getName(Intrinsic::experimental_guard));
HasGuards = GuardDecl && !GuardDecl->use_empty();
}
ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
: F(Arg.F), HasGuards(Arg.HasGuards), TLI(Arg.TLI), AC(Arg.AC), DT(Arg.DT),
LI(Arg.LI), CouldNotCompute(std::move(Arg.CouldNotCompute)),
ValueExprMap(std::move(Arg.ValueExprMap)),
PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
MinTrailingZerosCache(std::move(Arg.MinTrailingZerosCache)),
BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
PredicatedBackedgeTakenCounts(
std::move(Arg.PredicatedBackedgeTakenCounts)),
ConstantEvolutionLoopExitValue(
std::move(Arg.ConstantEvolutionLoopExitValue)),
ValuesAtScopes(std::move(Arg.ValuesAtScopes)),
LoopDispositions(std::move(Arg.LoopDispositions)),
LoopPropertiesCache(std::move(Arg.LoopPropertiesCache)),
BlockDispositions(std::move(Arg.BlockDispositions)),
UnsignedRanges(std::move(Arg.UnsignedRanges)),
SignedRanges(std::move(Arg.SignedRanges)),
UniqueSCEVs(std::move(Arg.UniqueSCEVs)),
UniquePreds(std::move(Arg.UniquePreds)),
SCEVAllocator(std::move(Arg.SCEVAllocator)),
PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)),
FirstUnknown(Arg.FirstUnknown) {
Arg.FirstUnknown = nullptr;
}
ScalarEvolution::~ScalarEvolution() {
// Iterate through all the SCEVUnknown instances and call their
// destructors, so that they release their references to their values.
for (SCEVUnknown *U = FirstUnknown; U;) {
SCEVUnknown *Tmp = U;
U = U->Next;
Tmp->~SCEVUnknown();
}
FirstUnknown = nullptr;
ExprValueMap.clear();
ValueExprMap.clear();
HasRecMap.clear();
// Free any extra memory created for ExitNotTakenInfo in the unlikely event
// that a loop had multiple computable exits.
for (auto &BTCI : BackedgeTakenCounts)
BTCI.second.clear();
for (auto &BTCI : PredicatedBackedgeTakenCounts)
BTCI.second.clear();
assert(PendingLoopPredicates.empty() && "isImpliedCond garbage");
assert(!WalkingBEDominatingConds && "isLoopBackedgeGuardedByCond garbage!");
assert(!ProvingSplitPredicate && "ProvingSplitPredicate garbage!");
}
bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) {
return !isa<SCEVCouldNotCompute>(getBackedgeTakenCount(L));
}
static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
const Loop *L) {
// Print all inner loops first
for (Loop *I : *L)
PrintLoopInfo(OS, SE, I);
OS << "Loop ";
L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": ";
SmallVector<BasicBlock *, 8> ExitBlocks;
L->getExitBlocks(ExitBlocks);
if (ExitBlocks.size() != 1)
OS << "<multiple exits> ";
if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
OS << "backedge-taken count is " << *SE->getBackedgeTakenCount(L);
} else {
OS << "Unpredictable backedge-taken count. ";
}
OS << "\n"
"Loop ";
L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": ";
if (!isa<SCEVCouldNotCompute>(SE->getMaxBackedgeTakenCount(L))) {
OS << "max backedge-taken count is " << *SE->getMaxBackedgeTakenCount(L);
if (SE->isBackedgeTakenCountMaxOrZero(L))
OS << ", actual taken count either this or zero.";
} else {
OS << "Unpredictable max backedge-taken count. ";
}
OS << "\n"
"Loop ";
L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": ";
SCEVUnionPredicate Pred;
auto PBT = SE->getPredicatedBackedgeTakenCount(L, Pred);
if (!isa<SCEVCouldNotCompute>(PBT)) {
OS << "Predicated backedge-taken count is " << *PBT << "\n";
OS << " Predicates:\n";
Pred.print(OS, 4);
} else {
OS << "Unpredictable predicated backedge-taken count. ";
}
OS << "\n";
if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
OS << "Loop ";
L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": ";
OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n";
}
}
static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) {
switch (LD) {
case ScalarEvolution::LoopVariant:
return "Variant";
case ScalarEvolution::LoopInvariant:
return "Invariant";
case ScalarEvolution::LoopComputable:
return "Computable";
}
llvm_unreachable("Unknown ScalarEvolution::LoopDisposition kind!");
}
void ScalarEvolution::print(raw_ostream &OS) const {
// ScalarEvolution's implementation of the print method is to print
// out SCEV values of all instructions that are interesting. Doing
// this potentially causes it to create new SCEV objects though,
// which technically conflicts with the const qualifier. This isn't
// observable from outside the class though, so casting away the
// const isn't dangerous.
ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
OS << "Classifying expressions for: ";
F.printAsOperand(OS, /*PrintType=*/false);
OS << "\n";
for (Instruction &I : instructions(F))
if (isSCEVable(I.getType()) && !isa<CmpInst>(I)) {
OS << I << '\n';
OS << " --> ";
const SCEV *SV = SE.getSCEV(&I);
SV->print(OS);
if (!isa<SCEVCouldNotCompute>(SV)) {
OS << " U: ";
SE.getUnsignedRange(SV).print(OS);
OS << " S: ";
SE.getSignedRange(SV).print(OS);
}
const Loop *L = LI.getLoopFor(I.getParent());
const SCEV *AtUse = SE.getSCEVAtScope(SV, L);
if (AtUse != SV) {
OS << " --> ";
AtUse->print(OS);
if (!isa<SCEVCouldNotCompute>(AtUse)) {
OS << " U: ";
SE.getUnsignedRange(AtUse).print(OS);
OS << " S: ";
SE.getSignedRange(AtUse).print(OS);
}
}
if (L) {
OS << "\t\t" "Exits: ";
const SCEV *ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop());
if (!SE.isLoopInvariant(ExitValue, L)) {
OS << "<<Unknown>>";
} else {
OS << *ExitValue;
}
bool First = true;
for (auto *Iter = L; Iter; Iter = Iter->getParentLoop()) {
if (First) {
OS << "\t\t" "LoopDispositions: { ";
First = false;
} else {
OS << ", ";
}
Iter->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, Iter));
}
for (auto *InnerL : depth_first(L)) {
if (InnerL == L)
continue;
if (First) {
OS << "\t\t" "LoopDispositions: { ";
First = false;
} else {
OS << ", ";
}
InnerL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, InnerL));
}
OS << " }";
}
OS << "\n";
}
OS << "Determining loop execution counts for: ";
F.printAsOperand(OS, /*PrintType=*/false);
OS << "\n";
for (Loop *I : LI)
PrintLoopInfo(OS, &SE, I);
}
ScalarEvolution::LoopDisposition
ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
auto &Values = LoopDispositions[S];
for (auto &V : Values) {
if (V.getPointer() == L)
return V.getInt();
}
Values.emplace_back(L, LoopVariant);
LoopDisposition D = computeLoopDisposition(S, L);
auto &Values2 = LoopDispositions[S];
for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
if (V.getPointer() == L) {
V.setInt(D);
break;
}
}
return D;
}
ScalarEvolution::LoopDisposition
ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
switch (static_cast<SCEVTypes>(S->getSCEVType())) {
case scConstant:
return LoopInvariant;
case scTruncate:
case scZeroExtend:
case scSignExtend:
return getLoopDisposition(cast<SCEVCastExpr>(S)->getOperand(), L);
case scAddRecExpr: {
const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S);
// If L is the addrec's loop, it's computable.
if (AR->getLoop() == L)
return LoopComputable;
// Add recurrences are never invariant in the function-body (null loop).
if (!L)
return LoopVariant;
// This recurrence is variant w.r.t. L if L contains AR's loop.
if (L->contains(AR->getLoop()))
return LoopVariant;
// This recurrence is invariant w.r.t. L if AR's loop contains L.
if (AR->getLoop()->contains(L))
return LoopInvariant;
// This recurrence is variant w.r.t. L if any of its operands
// are variant.
for (auto *Op : AR->operands())
if (!isLoopInvariant(Op, L))
return LoopVariant;
// Otherwise it's loop-invariant.
return LoopInvariant;
}
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr: {
bool HasVarying = false;
for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) {
LoopDisposition D = getLoopDisposition(Op, L);
if (D == LoopVariant)
return LoopVariant;
if (D == LoopComputable)
HasVarying = true;
}
return HasVarying ? LoopComputable : LoopInvariant;
}
case scUDivExpr: {
const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
LoopDisposition LD = getLoopDisposition(UDiv->getLHS(), L);
if (LD == LoopVariant)
return LoopVariant;
LoopDisposition RD = getLoopDisposition(UDiv->getRHS(), L);
if (RD == LoopVariant)
return LoopVariant;
return (LD == LoopInvariant && RD == LoopInvariant) ?
LoopInvariant : LoopComputable;
}
case scUnknown:
// All non-instruction values are loop invariant. All instructions are loop
// invariant if they are not contained in the specified loop.
// Instructions are never considered invariant in the function body
// (null loop) because they are defined within the "loop".
if (auto *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue()))
return (L && !L->contains(I)) ? LoopInvariant : LoopVariant;
return LoopInvariant;
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
}
llvm_unreachable("Unknown SCEV kind!");
}
bool ScalarEvolution::isLoopInvariant(const SCEV *S, const Loop *L) {
return getLoopDisposition(S, L) == LoopInvariant;
}
bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
return getLoopDisposition(S, L) == LoopComputable;
}
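// Illustrative usage sketch (added for exposition; not part of the original
// file).  A client that wants to know whether a Value can be treated as
// invariant with respect to a loop combines the public wrappers above; the
// helper name is hypothetical and it is not referenced anywhere.
static bool canTreatAsLoopInvariant(ScalarEvolution &SE, Value *V,
                                    const Loop *L) {
  if (!SE.isSCEVable(V->getType()))
    return false;                  // no SCEV form exists for this type
  // LoopInvariant means the SCEV expression does not change across
  // iterations of L (see computeLoopDisposition above).
  return SE.isLoopInvariant(SE.getSCEV(V), L);
}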
ScalarEvolution::BlockDisposition
ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
auto &Values = BlockDispositions[S];
for (auto &V : Values) {
if (V.getPointer() == BB)
return V.getInt();
}
Values.emplace_back(BB, DoesNotDominateBlock);
BlockDisposition D = computeBlockDisposition(S, BB);
auto &Values2 = BlockDispositions[S];
for (auto &V : make_range(Values2.rbegin(), Values2.rend())) {
if (V.getPointer() == BB) {
V.setInt(D);
break;
}
}
return D;
}
ScalarEvolution::BlockDisposition
ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) {
switch (static_cast<SCEVTypes>(S->getSCEVType())) {
case scConstant:
return ProperlyDominatesBlock;
case scTruncate:
case scZeroExtend:
case scSignExtend:
return getBlockDisposition(cast<SCEVCastExpr>(S)->getOperand(), BB);
case scAddRecExpr: {
// This uses a "dominates" query instead of "properly dominates" query
// to test for proper dominance too, because the instruction which
// produces the addrec's value is a PHI, and a PHI effectively properly
// dominates its entire containing block.
const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S);
if (!DT.dominates(AR->getLoop()->getHeader(), BB))
return DoesNotDominateBlock;
// Fall through into SCEVNAryExpr handling.
LLVM_FALLTHROUGH;
}
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr: {
const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
bool Proper = true;
for (const SCEV *NAryOp : NAry->operands()) {
BlockDisposition D = getBlockDisposition(NAryOp, BB);
if (D == DoesNotDominateBlock)
return DoesNotDominateBlock;
if (D == DominatesBlock)
Proper = false;
}
return Proper ? ProperlyDominatesBlock : DominatesBlock;
}
case scUDivExpr: {
const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS();
BlockDisposition LD = getBlockDisposition(LHS, BB);
if (LD == DoesNotDominateBlock)
return DoesNotDominateBlock;
BlockDisposition RD = getBlockDisposition(RHS, BB);
if (RD == DoesNotDominateBlock)
return DoesNotDominateBlock;
return (LD == ProperlyDominatesBlock && RD == ProperlyDominatesBlock) ?
ProperlyDominatesBlock : DominatesBlock;
}
case scUnknown:
if (Instruction *I =
dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) {
if (I->getParent() == BB)
return DominatesBlock;
if (DT.properlyDominates(I->getParent(), BB))
return ProperlyDominatesBlock;
return DoesNotDominateBlock;
}
return ProperlyDominatesBlock;
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
}
llvm_unreachable("Unknown SCEV kind!");
}
bool ScalarEvolution::dominates(const SCEV *S, const BasicBlock *BB) {
return getBlockDisposition(S, BB) >= DominatesBlock;
}
bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) {
return getBlockDisposition(S, BB) == ProperlyDominatesBlock;
}
bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const {
return SCEVExprContains(S, [&](const SCEV *Expr) { return Expr == Op; });
}
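// Illustrative sketch (added for exposition; not part of the original file):
// SCEVExprContains walks the whole expression tree, so the same mechanism
// answers other structural questions, e.g. whether S contains a division.
// The helper name is hypothetical and it is not referenced anywhere.
static bool sketchContainsUDiv(const SCEV *S) {
  return SCEVExprContains(S,
                          [](const SCEV *X) { return isa<SCEVUDivExpr>(X); });
}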
void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
ValuesAtScopes.erase(S);
LoopDispositions.erase(S);
BlockDispositions.erase(S);
UnsignedRanges.erase(S);
SignedRanges.erase(S);
ExprValueMap.erase(S);
HasRecMap.erase(S);
MinTrailingZerosCache.erase(S);
for (auto I = PredicatedSCEVRewrites.begin();
I != PredicatedSCEVRewrites.end();) {
std::pair<const SCEV *, const Loop *> Entry = I->first;
if (Entry.first == S)
PredicatedSCEVRewrites.erase(I++);
else
++I;
}
auto RemoveSCEVFromBackedgeMap =
[S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
for (auto I = Map.begin(), E = Map.end(); I != E;) {
BackedgeTakenInfo &BEInfo = I->second;
if (BEInfo.hasOperand(S, this)) {
BEInfo.clear();
Map.erase(I++);
} else
++I;
}
};
RemoveSCEVFromBackedgeMap(BackedgeTakenCounts);
RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts);
}
void ScalarEvolution::verify() const {
ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
ScalarEvolution SE2(F, TLI, AC, DT, LI);
SmallVector<Loop *, 8> LoopStack(LI.begin(), LI.end());
// Maps SCEV expressions from one ScalarEvolution "universe" to another.
struct SCEVMapper : public SCEVRewriteVisitor<SCEVMapper> {
const SCEV *visitConstant(const SCEVConstant *Constant) {
return SE.getConstant(Constant->getAPInt());
}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
return SE.getUnknown(Expr->getValue());
}
const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
return SE.getCouldNotCompute();
}
SCEVMapper(ScalarEvolution &SE) : SCEVRewriteVisitor<SCEVMapper>(SE) {}
};
SCEVMapper SCM(SE2);
while (!LoopStack.empty()) {
auto *L = LoopStack.pop_back_val();
LoopStack.insert(LoopStack.end(), L->begin(), L->end());
auto *CurBECount = SCM.visit(
const_cast<ScalarEvolution *>(this)->getBackedgeTakenCount(L));
auto *NewBECount = SE2.getBackedgeTakenCount(L);
if (CurBECount == SE2.getCouldNotCompute() ||
NewBECount == SE2.getCouldNotCompute()) {
// NB! This situation is legal, but is very suspicious -- whatever pass
// changed the loop to make a trip count go from could-not-compute to
// computable or vice-versa *should have* invalidated SCEV. However, we
// choose not to assert here (for now) since we don't want false
// positives.
continue;
}
if (containsUndefs(CurBECount) || containsUndefs(NewBECount)) {
// SCEV treats "undef" as an unknown but consistent value (i.e. it does
// not propagate undef aggressively). This means we can (and do) fail
// verification in cases where a transform makes the trip count of a loop
// go from "undef" to "undef+1" (say). The transform is fine, since in
// both cases the loop iterates "undef" times, but SCEV thinks we
// increased the trip count of the loop by 1 incorrectly.
continue;
}
if (SE.getTypeSizeInBits(CurBECount->getType()) >
SE.getTypeSizeInBits(NewBECount->getType()))
NewBECount = SE2.getZeroExtendExpr(NewBECount, CurBECount->getType());
else if (SE.getTypeSizeInBits(CurBECount->getType()) <
SE.getTypeSizeInBits(NewBECount->getType()))
CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType());
auto *ConstantDelta =
dyn_cast<SCEVConstant>(SE2.getMinusSCEV(CurBECount, NewBECount));
if (ConstantDelta && ConstantDelta->getAPInt() != 0) {
dbgs() << "Trip Count Changed!\n";
dbgs() << "Old: " << *CurBECount << "\n";
dbgs() << "New: " << *NewBECount << "\n";
dbgs() << "Delta: " << *ConstantDelta << "\n";
std::abort();
}
}
}
bool ScalarEvolution::invalidate(
Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
// Invalidate the ScalarEvolution object whenever it isn't preserved or one
// of its dependencies is invalidated.
auto PAC = PA.getChecker<ScalarEvolutionAnalysis>();
return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
Inv.invalidate<AssumptionAnalysis>(F, PA) ||
Inv.invalidate<DominatorTreeAnalysis>(F, PA) ||
Inv.invalidate<LoopAnalysis>(F, PA);
}
AnalysisKey ScalarEvolutionAnalysis::Key;
ScalarEvolution ScalarEvolutionAnalysis::run(Function &F,
FunctionAnalysisManager &AM) {
return ScalarEvolution(F, AM.getResult<TargetLibraryAnalysis>(F),
AM.getResult<AssumptionAnalysis>(F),
AM.getResult<DominatorTreeAnalysis>(F),
AM.getResult<LoopAnalysis>(F));
}
PreservedAnalyses
ScalarEvolutionPrinterPass::run(Function &F, FunctionAnalysisManager &AM) {
AM.getResult<ScalarEvolutionAnalysis>(F).print(OS);
return PreservedAnalyses::all();
}
INITIALIZE_PASS_BEGIN(ScalarEvolutionWrapperPass, "scalar-evolution",
"Scalar Evolution Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(ScalarEvolutionWrapperPass, "scalar-evolution",
"Scalar Evolution Analysis", false, true)
char ScalarEvolutionWrapperPass::ID = 0;
ScalarEvolutionWrapperPass::ScalarEvolutionWrapperPass() : FunctionPass(ID) {
initializeScalarEvolutionWrapperPassPass(*PassRegistry::getPassRegistry());
}
bool ScalarEvolutionWrapperPass::runOnFunction(Function &F) {
SE.reset(new ScalarEvolution(
F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo()));
return false;
}
void ScalarEvolutionWrapperPass::releaseMemory() { SE.reset(); }
void ScalarEvolutionWrapperPass::print(raw_ostream &OS, const Module *) const {
SE->print(OS);
}
void ScalarEvolutionWrapperPass::verifyAnalysis() const {
if (!VerifySCEV)
return;
SE->verify();
}
void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequiredTransitive<AssumptionCacheTracker>();
AU.addRequiredTransitive<LoopInfoWrapperPass>();
AU.addRequiredTransitive<DominatorTreeWrapperPass>();
AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>();
}
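// Illustrative sketch (added for exposition; not part of the original file):
// a minimal legacy-PM client of the wrapper pass above.  The pass name and
// ID are hypothetical and pass registration (INITIALIZE_PASS) is omitted;
// the real entry points are getAnalysis<...>() and
// ScalarEvolutionWrapperPass::getSE().
namespace {
struct SCEVConsumerSketch : public FunctionPass {
  static char ID;
  SCEVConsumerSketch() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override {
    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    for (Loop *L : LI)
      (void)SE.getBackedgeTakenCount(L); // query only; nothing is rewritten
    return false;                        // the IR is left unchanged
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
  }
};
char SCEVConsumerSketch::ID = 0;
} // end anonymous namespace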
const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS,
const SCEV *RHS) {
FoldingSetNodeID ID;
assert(LHS->getType() == RHS->getType() &&
"Type mismatch between LHS and RHS");
// Unique this node based on the arguments
ID.AddInteger(SCEVPredicate::P_Equal);
ID.AddPointer(LHS);
ID.AddPointer(RHS);
void *IP = nullptr;
if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP))
return S;
SCEVEqualPredicate *Eq = new (SCEVAllocator)
SCEVEqualPredicate(ID.Intern(SCEVAllocator), LHS, RHS);
UniquePreds.InsertNode(Eq, IP);
return Eq;
}
const SCEVPredicate *ScalarEvolution::getWrapPredicate(
const SCEVAddRecExpr *AR,
SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
FoldingSetNodeID ID;
// Unique this node based on the arguments
ID.AddInteger(SCEVPredicate::P_Wrap);
ID.AddPointer(AR);
ID.AddInteger(AddedFlags);
void *IP = nullptr;
if (const auto *S = UniquePreds.FindNodeOrInsertPos(ID, IP))
return S;
auto *OF = new (SCEVAllocator)
SCEVWrapPredicate(ID.Intern(SCEVAllocator), AR, AddedFlags);
UniquePreds.InsertNode(OF, IP);
return OF;
}
namespace {
class SCEVPredicateRewriter : public SCEVRewriteVisitor<SCEVPredicateRewriter> {
public:
/// Rewrites \p S in the context of a loop L and the SCEV predication
/// infrastructure.
///
/// If \p Pred is non-null, the SCEV expression is rewritten to respect the
/// equivalences present in \p Pred.
///
/// If \p NewPreds is non-null, rewrite is free to add further predicates to
/// \p NewPreds such that the result will be an AddRecExpr.
static const SCEV *rewrite(const SCEV *S, const Loop *L, ScalarEvolution &SE,
SmallPtrSetImpl<const SCEVPredicate *> *NewPreds,
SCEVUnionPredicate *Pred) {
SCEVPredicateRewriter Rewriter(L, SE, NewPreds, Pred);
return Rewriter.visit(S);
}
SCEVPredicateRewriter(const Loop *L, ScalarEvolution &SE,
SmallPtrSetImpl<const SCEVPredicate *> *NewPreds,
SCEVUnionPredicate *Pred)
: SCEVRewriteVisitor(SE), NewPreds(NewPreds), Pred(Pred), L(L) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
if (Pred) {
auto ExprPreds = Pred->getPredicatesForExpr(Expr);
for (auto *Pred : ExprPreds)
if (const auto *IPred = dyn_cast<SCEVEqualPredicate>(Pred))
if (IPred->getLHS() == Expr)
return IPred->getRHS();
}
return convertToAddRecWithPreds(Expr);
}
const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
const SCEV *Operand = visit(Expr->getOperand());
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
if (AR && AR->getLoop() == L && AR->isAffine()) {
// This couldn't be folded because the operand didn't have the nuw
// flag. Add the nusw flag as an assumption that we could make.
const SCEV *Step = AR->getStepRecurrence(SE);
Type *Ty = Expr->getType();
if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNUSW))
return SE.getAddRecExpr(SE.getZeroExtendExpr(AR->getStart(), Ty),
SE.getSignExtendExpr(Step, Ty), L,
AR->getNoWrapFlags());
}
return SE.getZeroExtendExpr(Operand, Expr->getType());
}
const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
const SCEV *Operand = visit(Expr->getOperand());
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Operand);
if (AR && AR->getLoop() == L && AR->isAffine()) {
// This couldn't be folded because the operand didn't have the nsw
// flag. Add the nssw flag as an assumption that we could make.
const SCEV *Step = AR->getStepRecurrence(SE);
Type *Ty = Expr->getType();
if (addOverflowAssumption(AR, SCEVWrapPredicate::IncrementNSSW))
return SE.getAddRecExpr(SE.getSignExtendExpr(AR->getStart(), Ty),
SE.getSignExtendExpr(Step, Ty), L,
AR->getNoWrapFlags());
}
return SE.getSignExtendExpr(Operand, Expr->getType());
}
private:
bool addOverflowAssumption(const SCEVPredicate *P) {
if (!NewPreds) {
// Check if we've already made this assumption.
return Pred && Pred->implies(P);
}
NewPreds->insert(P);
return true;
}
bool addOverflowAssumption(const SCEVAddRecExpr *AR,
SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
auto *A = SE.getWrapPredicate(AR, AddedFlags);
return addOverflowAssumption(A);
}
// If \p Expr represents a PHINode, we try to see if it can be represented
// as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible
// to add this predicate as a runtime overflow check, we return the AddRec.
// If \p Expr does not meet these conditions (is not a PHI node, or we
// couldn't create an AddRec for it, or couldn't add the predicate), we just
// return \p Expr.
const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) {
if (!isa<PHINode>(Expr->getValue()))
return Expr;
Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr);
if (!PredicatedRewrite)
return Expr;
for (auto *P : PredicatedRewrite->second){
if (!addOverflowAssumption(P))
return Expr;
}
return PredicatedRewrite->first;
}
SmallPtrSetImpl<const SCEVPredicate *> *NewPreds;
SCEVUnionPredicate *Pred;
const Loop *L;
};
} // end anonymous namespace
const SCEV *ScalarEvolution::rewriteUsingPredicate(const SCEV *S, const Loop *L,
SCEVUnionPredicate &Preds) {
return SCEVPredicateRewriter::rewrite(S, L, *this, nullptr, &Preds);
}
const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates(
const SCEV *S, const Loop *L,
SmallPtrSetImpl<const SCEVPredicate *> &Preds) {
SmallPtrSet<const SCEVPredicate *, 4> TransformPreds;
S = SCEVPredicateRewriter::rewrite(S, L, *this, &TransformPreds, nullptr);
auto *AddRec = dyn_cast<SCEVAddRecExpr>(S);
if (!AddRec)
return nullptr;
// Since the transformation was successful, we can now transfer the SCEV
// predicates.
for (auto *P : TransformPreds)
Preds.insert(P);
return AddRec;
}
/// SCEV predicates
SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
SCEVPredicateKind Kind)
: FastID(ID), Kind(Kind) {}
SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
const SCEV *LHS, const SCEV *RHS)
: SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {
assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match");
assert(LHS != RHS && "LHS and RHS are the same SCEV");
}
bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
const auto *Op = dyn_cast<SCEVEqualPredicate>(N);
if (!Op)
return false;
return Op->LHS == LHS && Op->RHS == RHS;
}
bool SCEVEqualPredicate::isAlwaysTrue() const { return false; }
const SCEV *SCEVEqualPredicate::getExpr() const { return LHS; }
void SCEVEqualPredicate::print(raw_ostream &OS, unsigned Depth) const {
OS.indent(Depth) << "Equal predicate: " << *LHS << " == " << *RHS << "\n";
}
SCEVWrapPredicate::SCEVWrapPredicate(const FoldingSetNodeIDRef ID,
const SCEVAddRecExpr *AR,
IncrementWrapFlags Flags)
: SCEVPredicate(ID, P_Wrap), AR(AR), Flags(Flags) {}
const SCEV *SCEVWrapPredicate::getExpr() const { return AR; }
bool SCEVWrapPredicate::implies(const SCEVPredicate *N) const {
const auto *Op = dyn_cast<SCEVWrapPredicate>(N);
return Op && Op->AR == AR && setFlags(Flags, Op->Flags) == Flags;
}
bool SCEVWrapPredicate::isAlwaysTrue() const {
SCEV::NoWrapFlags ScevFlags = AR->getNoWrapFlags();
IncrementWrapFlags IFlags = Flags;
if (ScalarEvolution::setFlags(ScevFlags, SCEV::FlagNSW) == ScevFlags)
IFlags = clearFlags(IFlags, IncrementNSSW);
return IFlags == IncrementAnyWrap;
}
void SCEVWrapPredicate::print(raw_ostream &OS, unsigned Depth) const {
OS.indent(Depth) << *getExpr() << " Added Flags: ";
if (SCEVWrapPredicate::IncrementNUSW & getFlags())
OS << "<nusw>";
if (SCEVWrapPredicate::IncrementNSSW & getFlags())
OS << "<nssw>";
OS << "\n";
}
SCEVWrapPredicate::IncrementWrapFlags
SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR,
ScalarEvolution &SE) {
IncrementWrapFlags ImpliedFlags = IncrementAnyWrap;
SCEV::NoWrapFlags StaticFlags = AR->getNoWrapFlags();
// We can safely transfer the NSW flag as NSSW.
if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNSW) == StaticFlags)
ImpliedFlags = IncrementNSSW;
if (ScalarEvolution::setFlags(StaticFlags, SCEV::FlagNUW) == StaticFlags) {
// If the increment is positive, the SCEV NUW flag will also imply the
// WrapPredicate NUSW flag.
if (const auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
if (Step->getValue()->getValue().isNonNegative())
ImpliedFlags = setFlags(ImpliedFlags, IncrementNUSW);
}
return ImpliedFlags;
}
/// Union predicates don't get cached, so create a dummy FoldingSet ID for them.
SCEVUnionPredicate::SCEVUnionPredicate()
: SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {}
bool SCEVUnionPredicate::isAlwaysTrue() const {
return all_of(Preds,
[](const SCEVPredicate *I) { return I->isAlwaysTrue(); });
}
ArrayRef<const SCEVPredicate *>
SCEVUnionPredicate::getPredicatesForExpr(const SCEV *Expr) {
auto I = SCEVToPreds.find(Expr);
if (I == SCEVToPreds.end())
return ArrayRef<const SCEVPredicate *>();
return I->second;
}
bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
if (const auto *Set = dyn_cast<SCEVUnionPredicate>(N))
return all_of(Set->Preds,
[this](const SCEVPredicate *I) { return this->implies(I); });
auto ScevPredsIt = SCEVToPreds.find(N->getExpr());
if (ScevPredsIt == SCEVToPreds.end())
return false;
auto &SCEVPreds = ScevPredsIt->second;
return any_of(SCEVPreds,
[N](const SCEVPredicate *I) { return I->implies(N); });
}
const SCEV *SCEVUnionPredicate::getExpr() const { return nullptr; }
void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const {
for (auto Pred : Preds)
Pred->print(OS, Depth);
}
void SCEVUnionPredicate::add(const SCEVPredicate *N) {
if (const auto *Set = dyn_cast<SCEVUnionPredicate>(N)) {
for (auto Pred : Set->Preds)
add(Pred);
return;
}
if (implies(N))
return;
const SCEV *Key = N->getExpr();
assert(Key && "Only SCEVUnionPredicate doesn't have an "
" associated expression!");
SCEVToPreds[Key].push_back(N);
Preds.push_back(N);
}
PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE,
Loop &L)
: SE(SE), L(L), Generation(0), BackedgeCount(nullptr) {}
const SCEV *PredicatedScalarEvolution::getSCEV(Value *V) {
const SCEV *Expr = SE.getSCEV(V);
RewriteEntry &Entry = RewriteMap[Expr];
// If we already have an entry and the version matches, return it.
if (Entry.second && Generation == Entry.first)
return Entry.second;
// We found an entry but it's stale. Rewrite the stale entry
// according to the current predicate.
if (Entry.second)
Expr = Entry.second;
const SCEV *NewSCEV = SE.rewriteUsingPredicate(Expr, &L, Preds);
Entry = {Generation, NewSCEV};
return NewSCEV;
}
const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
if (!BackedgeCount) {
SCEVUnionPredicate BackedgePred;
BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, BackedgePred);
addPredicate(BackedgePred);
}
return BackedgeCount;
}
void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
if (Preds.implies(&Pred))
return;
Preds.add(&Pred);
updateGeneration();
}
const SCEVUnionPredicate &PredicatedScalarEvolution::getUnionPredicate() const {
return Preds;
}
void PredicatedScalarEvolution::updateGeneration() {
// If the generation number wrapped, recompute everything.
if (++Generation == 0) {
for (auto &II : RewriteMap) {
const SCEV *Rewritten = II.second.second;
II.second = {Generation, SE.rewriteUsingPredicate(Rewritten, &L, Preds)};
}
}
}
void PredicatedScalarEvolution::setNoOverflow(
Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) {
const SCEV *Expr = getSCEV(V);
const auto *AR = cast<SCEVAddRecExpr>(Expr);
auto ImpliedFlags = SCEVWrapPredicate::getImpliedFlags(AR, SE);
// Clear the statically implied flags.
Flags = SCEVWrapPredicate::clearFlags(Flags, ImpliedFlags);
addPredicate(*SE.getWrapPredicate(AR, Flags));
auto II = FlagsMap.insert({V, Flags});
if (!II.second)
II.first->second = SCEVWrapPredicate::setFlags(Flags, II.first->second);
}
bool PredicatedScalarEvolution::hasNoOverflow(
Value *V, SCEVWrapPredicate::IncrementWrapFlags Flags) {
const SCEV *Expr = getSCEV(V);
const auto *AR = cast<SCEVAddRecExpr>(Expr);
Flags = SCEVWrapPredicate::clearFlags(
Flags, SCEVWrapPredicate::getImpliedFlags(AR, SE));
auto II = FlagsMap.find(V);
if (II != FlagsMap.end())
Flags = SCEVWrapPredicate::clearFlags(Flags, II->second);
return Flags == SCEVWrapPredicate::IncrementAnyWrap;
}
const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) {
const SCEV *Expr = this->getSCEV(V);
SmallPtrSet<const SCEVPredicate *, 4> NewPreds;
auto *New = SE.convertSCEVToAddRecWithPredicates(Expr, &L, NewPreds);
if (!New)
return nullptr;
for (auto *P : NewPreds)
Preds.add(P);
updateGeneration();
RewriteMap[SE.getSCEV(V)] = {Generation, New};
return New;
}
PredicatedScalarEvolution::PredicatedScalarEvolution(
const PredicatedScalarEvolution &Init)
: RewriteMap(Init.RewriteMap), SE(Init.SE), L(Init.L), Preds(Init.Preds),
Generation(Init.Generation), BackedgeCount(Init.BackedgeCount) {
for (const auto &I : Init.FlagsMap)
FlagsMap.insert(I);
}
void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
// For each block.
for (auto *BB : L.getBlocks())
for (auto &I : *BB) {
if (!SE.isSCEVable(I.getType()))
continue;
auto *Expr = SE.getSCEV(&I);
auto II = RewriteMap.find(Expr);
if (II == RewriteMap.end())
continue;
// Don't print things that are not interesting.
if (II->second.second == Expr)
continue;
OS.indent(Depth) << "[PSE]" << I << ":\n";
OS.indent(Depth + 2) << *Expr << "\n";
OS.indent(Depth + 2) << "--> " << *II->second.second << "\n";
}
}
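// Illustrative usage sketch (added for exposition; not part of the original
// file): how a vectorizer-style client typically drives the predicated
// interface defined above.  The helper name is hypothetical and the policy
// of refusing any non-trivial predicate is just one possible choice.
static const SCEVAddRecExpr *sketchGetAddRecOrNull(ScalarEvolution &SE,
                                                   Loop &L, Value *Ptr) {
  PredicatedScalarEvolution PSE(SE, L);
  // getAsAddRec may add SCEVWrapPredicates (runtime overflow checks) to the
  // union predicate in order to produce an AddRec form for Ptr.
  const SCEVAddRecExpr *AR = PSE.getAsAddRec(Ptr);
  if (!AR)
    return nullptr;
  // A real client would emit the collected runtime checks; this sketch
  // simply bails out unless every predicate already holds statically.
  if (!PSE.getUnionPredicate().isAlwaysTrue())
    return nullptr;
  return AR;
}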
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 439b21a81258..cdfe74d158c9 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -1,4541 +1,4545 @@
//===- ValueTracking.cpp - Walk computations to compute properties --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains routines that help analyze properties that chains of
// computations have.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <array>
#include <cstring>
using namespace llvm;
using namespace llvm::PatternMatch;
const unsigned MaxDepth = 6;
// Controls the number of uses of the value searched for possible
// dominating comparisons.
static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
cl::Hidden, cl::init(20));
// This optimization is known to cause performance regressions in some cases,
// so keep it under a temporary flag for now.
static cl::opt<bool>
DontImproveNonNegativePhiBits("dont-improve-non-negative-phi-bits",
cl::Hidden, cl::init(true));
/// Returns the bitwidth of the given scalar or pointer type. For vector types,
/// returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
return DL.getPointerTypeSizeInBits(Ty);
}
namespace {
// Simplifying using an assume can only be done in a particular control-flow
// context (the context instruction provides that context). If an assume and
// the context instruction are not in the same block then the DT helps in
// figuring out if we can use it.
struct Query {
const DataLayout &DL;
AssumptionCache *AC;
const Instruction *CxtI;
const DominatorTree *DT;
// Unlike the other analyses, this may be a nullptr because not all clients
// provide it currently.
OptimizationRemarkEmitter *ORE;
/// Set of assumptions that should be excluded from further queries.
/// This is because of the potential for mutual recursion to cause
/// computeKnownBits to repeatedly visit the same assume intrinsic. The
/// classic case of this is assume(x = y), which will attempt to determine
/// bits in x from bits in y, which will attempt to determine bits in y from
/// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
/// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
/// (all of which can call computeKnownBits), and so on.
std::array<const Value *, MaxDepth> Excluded;
unsigned NumExcluded;
Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr)
: DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE), NumExcluded(0) {}
Query(const Query &Q, const Value *NewExcl)
: DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE),
NumExcluded(Q.NumExcluded) {
Excluded = Q.Excluded;
Excluded[NumExcluded++] = NewExcl;
assert(NumExcluded <= Excluded.size());
}
bool isExcluded(const Value *Value) const {
if (NumExcluded == 0)
return false;
auto End = Excluded.begin() + NumExcluded;
return std::find(Excluded.begin(), End, Value) != End;
}
};
} // end anonymous namespace
// Given the provided Value and, potentially, a context instruction, return
// the preferred context instruction (if any).
static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
// If we've been provided with a context instruction, then use that (provided
// it has been inserted).
if (CxtI && CxtI->getParent())
return CxtI;
// If the value is really an already-inserted instruction, then use that.
CxtI = dyn_cast<Instruction>(V);
if (CxtI && CxtI->getParent())
return CxtI;
return nullptr;
}
static void computeKnownBits(const Value *V, KnownBits &Known,
unsigned Depth, const Query &Q);
void llvm::computeKnownBits(const Value *V, KnownBits &Known,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT,
OptimizationRemarkEmitter *ORE) {
::computeKnownBits(V, Known, Depth,
Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
static KnownBits computeKnownBits(const Value *V, unsigned Depth,
const Query &Q);
KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT,
OptimizationRemarkEmitter *ORE) {
return ::computeKnownBits(V, Depth,
Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
assert(LHS->getType() == RHS->getType() &&
"LHS and RHS should have the same type");
assert(LHS->getType()->isIntOrIntVectorTy() &&
"LHS and RHS should be integers");
IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType());
KnownBits LHSKnown(IT->getBitWidth());
KnownBits RHSKnown(IT->getBitWidth());
computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT);
computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT);
return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}
bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
for (const User *U : CxtI->users()) {
if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
if (IC->isEquality())
if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
if (C->isNullValue())
continue;
return false;
}
return true;
}
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q);
bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
bool OrZero,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
Query(DL, AC, safeCxtI(V, CxtI), DT));
}
static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q);
bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
}
bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
return Known.isNonNegative();
}
bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
if (auto *CI = dyn_cast<ConstantInt>(V))
return CI->getValue().isStrictlyPositive();
// TODO: We're doing two recursive queries here. We should factor this such
// that only a single query is needed.
return isKnownNonNegative(V, DL, Depth, AC, CxtI, DT) &&
isKnownNonZero(V, DL, Depth, AC, CxtI, DT);
}
bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
return Known.isNegative();
}
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);
bool llvm::isKnownNonEqual(const Value *V1, const Value *V2,
const DataLayout &DL,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
return ::isKnownNonEqual(V1, V2, Query(DL, AC,
safeCxtI(V1, safeCxtI(V2, CxtI)),
DT));
}
static bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
const Query &Q);
bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask,
const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI, const DominatorTree *DT) {
return ::MaskedValueIsZero(V, Mask, Depth,
Query(DL, AC, safeCxtI(V, CxtI), DT));
}
static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
const Query &Q);
unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
}
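// Illustrative usage sketch (added for exposition; not part of the original
// file): how a typical client consumes the KnownBits result of the wrappers
// above, relying on the header's defaulted analysis arguments.  The helper
// name is hypothetical and it is not referenced anywhere.
static bool isKnownMultipleOfEight(const Value *V, const DataLayout &DL) {
  KnownBits Known = computeKnownBits(V, DL, /*Depth=*/0);
  // A value is a multiple of 8 when its three low bits are known to be zero.
  return Known.countMinTrailingZeros() >= 3;
}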
static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
bool NSW,
KnownBits &KnownOut, KnownBits &Known2,
unsigned Depth, const Query &Q) {
unsigned BitWidth = KnownOut.getBitWidth();
// If an initial sequence of bits in the result is not needed, the
// corresponding bits in the operands are not needed.
KnownBits LHSKnown(BitWidth);
computeKnownBits(Op0, LHSKnown, Depth + 1, Q);
computeKnownBits(Op1, Known2, Depth + 1, Q);
// Carry in a 1 for a subtract, rather than a 0.
uint64_t CarryIn = 0;
if (!Add) {
// Sum = LHS + ~RHS + 1
std::swap(Known2.Zero, Known2.One);
CarryIn = 1;
}
APInt PossibleSumZero = ~LHSKnown.Zero + ~Known2.Zero + CarryIn;
APInt PossibleSumOne = LHSKnown.One + Known2.One + CarryIn;
// Compute known bits of the carry.
APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnown.Zero ^ Known2.Zero);
APInt CarryKnownOne = PossibleSumOne ^ LHSKnown.One ^ Known2.One;
// Compute set of known bits (where all three relevant bits are known).
APInt LHSKnownUnion = LHSKnown.Zero | LHSKnown.One;
APInt RHSKnownUnion = Known2.Zero | Known2.One;
APInt CarryKnownUnion = CarryKnownZero | CarryKnownOne;
APInt Known = LHSKnownUnion & RHSKnownUnion & CarryKnownUnion;
assert((PossibleSumZero & Known) == (PossibleSumOne & Known) &&
"known bits of sum differ");
// Compute known bits of the result.
KnownOut.Zero = ~PossibleSumOne & Known;
KnownOut.One = PossibleSumOne & Known;
// Are we still trying to solve for the sign bit?
if (!Known.isSignBitSet()) {
if (NSW) {
// Adding two non-negative numbers, or subtracting a negative number from
// a non-negative one, can't wrap into negative.
if (LHSKnown.isNonNegative() && Known2.isNonNegative())
KnownOut.makeNonNegative();
// Adding two negative numbers, or subtracting a non-negative number from
// a negative one, can't wrap into non-negative.
else if (LHSKnown.isNegative() && Known2.isNegative())
KnownOut.makeNegative();
}
}
}
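// Worked example (added for exposition; not part of the original file) of the
// carry propagation above, on 8-bit values with plain integers instead of
// APInt.  LHS is known to be 0b000001?? (low two bits unknown, so LHS is in
// [4, 7]) and RHS is the constant 4, so LHS + RHS is in [8, 11], i.e.
// 0b000010??.  The computation below recovers exactly that: bit 3 known one,
// bits 2 and 4..7 known zero, bits 0..1 unknown.  The helper is hypothetical
// and is never called.
static void knownBitsAddWorkedExample() {
  uint8_t LHSZero = 0xF8, LHSOne = 0x04; // LHS = 0b000001??
  uint8_t RHSZero = 0xFB, RHSOne = 0x04; // RHS = 4, fully known
  // CarryIn is 0 because this is an add, not a subtract.
  uint8_t PossibleSumZero = ~LHSZero + ~RHSZero; // unknown bits treated as 1
  uint8_t PossibleSumOne = LHSOne + RHSOne;      // unknown bits treated as 0
  uint8_t CarryKnownZero = ~(PossibleSumZero ^ LHSZero ^ RHSZero);
  uint8_t CarryKnownOne = PossibleSumOne ^ LHSOne ^ RHSOne;
  uint8_t AllKnown = (LHSZero | LHSOne) & (RHSZero | RHSOne) &
                     (CarryKnownZero | CarryKnownOne);
  uint8_t KnownZero = ~PossibleSumOne & AllKnown; // 0xF4: bits 2 and 4..7
  uint8_t KnownOne = PossibleSumOne & AllKnown;   // 0x08: bit 3
  assert(KnownZero == 0xF4 && KnownOne == 0x08 && "worked example went stale");
  (void)KnownZero;
  (void)KnownOne;
}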
static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
KnownBits &Known, KnownBits &Known2,
unsigned Depth, const Query &Q) {
unsigned BitWidth = Known.getBitWidth();
computeKnownBits(Op1, Known, Depth + 1, Q);
computeKnownBits(Op0, Known2, Depth + 1, Q);
bool isKnownNegative = false;
bool isKnownNonNegative = false;
// If the multiplication is known not to overflow, compute the sign bit.
if (NSW) {
if (Op0 == Op1) {
// The product of a number with itself is non-negative.
isKnownNonNegative = true;
} else {
bool isKnownNonNegativeOp1 = Known.isNonNegative();
bool isKnownNonNegativeOp0 = Known2.isNonNegative();
bool isKnownNegativeOp1 = Known.isNegative();
bool isKnownNegativeOp0 = Known2.isNegative();
// The product of two numbers with the same sign is non-negative.
isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) ||
(isKnownNonNegativeOp1 && isKnownNonNegativeOp0);
// The product of a negative number and a non-negative number is either
// negative or zero.
if (!isKnownNonNegative)
isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 &&
isKnownNonZero(Op0, Depth, Q)) ||
(isKnownNegativeOp0 && isKnownNonNegativeOp1 &&
isKnownNonZero(Op1, Depth, Q));
}
}
// If low bits are zero in either operand, output low known-0 bits.
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
unsigned TrailZ = Known.countMinTrailingZeros() +
Known2.countMinTrailingZeros();
unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
TrailZ = std::min(TrailZ, BitWidth);
LeadZ = std::min(LeadZ, BitWidth);
Known.resetAll();
Known.Zero.setLowBits(TrailZ);
Known.Zero.setHighBits(LeadZ);
// Only make use of no-wrap flags if we failed to compute the sign bit
// directly. This matters if the multiplication always overflows, in
// which case we prefer to follow the result of the direct computation,
// though as the program is invoking undefined behaviour we can choose
// whatever we like here.
if (isKnownNonNegative && !Known.isNegative())
Known.makeNonNegative();
else if (isKnownNegative && !Known.isNonNegative())
Known.makeNegative();
}
void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
KnownBits &Known) {
unsigned BitWidth = Known.getBitWidth();
unsigned NumRanges = Ranges.getNumOperands() / 2;
assert(NumRanges >= 1);
Known.Zero.setAllBits();
Known.One.setAllBits();
for (unsigned i = 0; i < NumRanges; ++i) {
ConstantInt *Lower =
mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0));
ConstantInt *Upper =
mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1));
ConstantRange Range(Lower->getValue(), Upper->getValue());
// The first CommonPrefixBits of all values in Range are equal.
unsigned CommonPrefixBits =
(Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros();
APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits);
Known.One &= Range.getUnsignedMax() & Mask;
Known.Zero &= ~Range.getUnsignedMax() & Mask;
}
}
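// Worked example (added for exposition; not part of the original file) of the
// common-prefix computation above, using the 8-bit range [96, 128): every
// value in the range looks like 0b011xxxxx, and the code recovers exactly
// that three-bit prefix.  The helper is hypothetical and is never called.
static void rangePrefixWorkedExample() {
  unsigned UMin = 96, UMax = 127;  // getUnsignedMin() / getUnsignedMax()
  unsigned Diff = UMin ^ UMax;     // 0x1F: only the low 5 bits ever differ
  // In an 8-bit width, Diff has 3 leading zeros, so the top 3 bits agree.
  unsigned CommonPrefixBits = 3;
  unsigned Mask = 0xE0;            // the high CommonPrefixBits bits
  unsigned KnownOne = UMax & Mask; // 0x60: bits 6 and 5 are known one
  unsigned KnownZero = ~UMax & Mask; // 0x80: bit 7 is known zero
  assert(Diff == 0x1F && KnownOne == 0x60 && KnownZero == 0x80 &&
         "worked example went stale");
  (void)Diff;
  (void)CommonPrefixBits;
  (void)KnownZero;
  (void)KnownOne;
}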
static bool isEphemeralValueOf(const Instruction *I, const Value *E) {
SmallVector<const Value *, 16> WorkSet(1, I);
SmallPtrSet<const Value *, 32> Visited;
SmallPtrSet<const Value *, 16> EphValues;
// The instruction defining an assumption's condition itself is always
// considered ephemeral to that assumption (even if it has other
// non-ephemeral users). See r246696's test case for an example.
if (is_contained(I->operands(), E))
return true;
while (!WorkSet.empty()) {
const Value *V = WorkSet.pop_back_val();
if (!Visited.insert(V).second)
continue;
// If all uses of this value are ephemeral, then so is this value.
if (all_of(V->users(), [&](const User *U) { return EphValues.count(U); })) {
if (V == E)
return true;
EphValues.insert(V);
if (const User *U = dyn_cast<User>(V))
for (User::const_op_iterator J = U->op_begin(), JE = U->op_end();
J != JE; ++J) {
if (isSafeToSpeculativelyExecute(*J))
WorkSet.push_back(*J);
}
}
}
return false;
}
// Is this an intrinsic that cannot be speculated but also cannot trap?
static bool isAssumeLikeIntrinsic(const Instruction *I) {
if (const CallInst *CI = dyn_cast<CallInst>(I))
if (Function *F = CI->getCalledFunction())
switch (F->getIntrinsicID()) {
default: break;
// FIXME: This list is repeated from NoTTI::getIntrinsicCost.
case Intrinsic::assume:
case Intrinsic::dbg_declare:
case Intrinsic::dbg_value:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
case Intrinsic::objectsize:
case Intrinsic::ptr_annotation:
case Intrinsic::var_annotation:
return true;
}
return false;
}
bool llvm::isValidAssumeForContext(const Instruction *Inv,
const Instruction *CxtI,
const DominatorTree *DT) {
// There are two restrictions on the use of an assume:
// 1. The assume must dominate the context (or the control flow must
// reach the assume whenever it reaches the context).
// 2. The context must not be in the assume's set of ephemeral values
// (otherwise we will use the assume to prove that the condition
// feeding the assume is trivially true, thus causing the removal of
// the assume).
if (DT) {
if (DT->dominates(Inv, CxtI))
return true;
} else if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) {
// We don't have a DT, but this trivially dominates.
return true;
}
// With or without a DT, the only remaining case we will check is if the
// instructions are in the same BB. Give up if that is not the case.
if (Inv->getParent() != CxtI->getParent())
return false;
// If we have a dom tree, then we now know that the assume doesn't dominate
// the other instruction. If we don't have a dom tree then we can check if
// the assume is first in the BB.
if (!DT) {
// Search forward from the assume until we reach the context (or the end
// of the block); the common case is that the assume will come first.
for (auto I = std::next(BasicBlock::const_iterator(Inv)),
IE = Inv->getParent()->end(); I != IE; ++I)
if (&*I == CxtI)
return true;
}
// The context comes first, but they're both in the same block. Make sure
// there is nothing in between that might interrupt the control flow.
for (BasicBlock::const_iterator I =
std::next(BasicBlock::const_iterator(CxtI)), IE(Inv);
I != IE; ++I)
if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
return false;
return !isEphemeralValueOf(Inv, CxtI);
}
static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
unsigned Depth, const Query &Q) {
// Use of assumptions is context-sensitive. If we don't have a context, we
// cannot use them!
if (!Q.AC || !Q.CxtI)
return;
unsigned BitWidth = Known.getBitWidth();
// Note that the patterns below need to be kept in sync with the code
// in AssumptionCache::updateAffectedValues.
for (auto &AssumeVH : Q.AC->assumptionsFor(V)) {
if (!AssumeVH)
continue;
CallInst *I = cast<CallInst>(AssumeVH);
assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() &&
"Got assumption for the wrong function!");
if (Q.isExcluded(I))
continue;
// Warning: This loop can end up being somewhat performance sensitive.
// We're running this loop once for each value queried, resulting in a
// runtime of ~O(#assumes * #values).
assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume &&
"must be an assume intrinsic");
Value *Arg = I->getArgOperand(0);
if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
assert(BitWidth == 1 && "assume operand is not i1?");
Known.setAllOnes();
return;
}
if (match(Arg, m_Not(m_Specific(V))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
assert(BitWidth == 1 && "assume operand is not i1?");
Known.setAllZero();
return;
}
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth == MaxDepth)
continue;
Value *A, *B;
auto m_V = m_CombineOr(m_Specific(V),
m_CombineOr(m_PtrToInt(m_Specific(V)),
m_BitCast(m_Specific(V))));
CmpInst::Predicate Pred;
ConstantInt *C;
// assume(v = a)
if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
Known.Zero |= RHSKnown.Zero;
Known.One |= RHSKnown.One;
// assume(v & b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits MaskKnown(BitWidth);
computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// known bits from the RHS to V.
Known.Zero |= RHSKnown.Zero & MaskKnown.One;
Known.One |= RHSKnown.One & MaskKnown.One;
// assume(~(v & b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits MaskKnown(BitWidth);
computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// inverted known bits from the RHS to V.
Known.Zero |= RHSKnown.One & MaskKnown.One;
Known.One |= RHSKnown.Zero & MaskKnown.One;
// assume(v | b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits BKnown(BitWidth);
computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V.
Known.Zero |= RHSKnown.Zero & BKnown.Zero;
Known.One |= RHSKnown.One & BKnown.Zero;
// assume(~(v | b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits BKnown(BitWidth);
computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V.
Known.Zero |= RHSKnown.One & BKnown.Zero;
Known.One |= RHSKnown.Zero & BKnown.Zero;
// assume(v ^ b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits BKnown(BitWidth);
computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V. For those bits in B that are known to be one,
// we can propagate inverted known bits from the RHS to V.
Known.Zero |= RHSKnown.Zero & BKnown.Zero;
Known.One |= RHSKnown.One & BKnown.Zero;
Known.Zero |= RHSKnown.One & BKnown.One;
Known.One |= RHSKnown.Zero & BKnown.One;
// assume(~(v ^ b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
KnownBits BKnown(BitWidth);
computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V. For those bits in B that are
// known to be one, we can propagate known bits from the RHS to V.
Known.Zero |= RHSKnown.One & BKnown.Zero;
Known.One |= RHSKnown.Zero & BKnown.Zero;
Known.Zero |= RHSKnown.Zero & BKnown.One;
Known.One |= RHSKnown.One & BKnown.One;
// assume(v << c = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
RHSKnown.Zero.lshrInPlace(C->getZExtValue());
Known.Zero |= RHSKnown.Zero;
RHSKnown.One.lshrInPlace(C->getZExtValue());
Known.One |= RHSKnown.One;
// assume(~(v << c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
RHSKnown.One.lshrInPlace(C->getZExtValue());
Known.Zero |= RHSKnown.One;
RHSKnown.Zero.lshrInPlace(C->getZExtValue());
Known.One |= RHSKnown.Zero;
// assume(v >> c = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
Known.Zero |= RHSKnown.Zero << C->getZExtValue();
Known.One |= RHSKnown.One << C->getZExtValue();
// assume(~(v >> c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
Known.Zero |= RHSKnown.One << C->getZExtValue();
Known.One |= RHSKnown.Zero << C->getZExtValue();
// assume(v >=_s c) where c is non-negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SGE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
if (RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
Known.makeNonNegative();
}
// assume(v >_s c) where c is at least -1.
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SGT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
if (RHSKnown.isAllOnes() || RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
Known.makeNonNegative();
}
// assume(v <=_s c) where c is negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SLE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
if (RHSKnown.isNegative()) {
// We know that the sign bit is one.
Known.makeNegative();
}
// assume(v <_s c) where c is non-positive
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SLT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
if (RHSKnown.isZero() || RHSKnown.isNegative()) {
// We know that the sign bit is one.
Known.makeNegative();
}
// assume(v <=_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
// assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
KnownBits RHSKnown(BitWidth);
computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
else
Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
}
}
// If assumptions conflict with each other or previous known bits, then we
// have a logical fallacy. It's possible that the assumption is not reachable,
// so this isn't a real bug. On the other hand, the program may have undefined
// behavior, or we might have a bug in the compiler. We can't assert/crash, so
// clear out the known bits, try to warn the user, and hope for the best.
if (Known.Zero.intersects(Known.One)) {
Known.resetAll();
if (Q.ORE) {
auto *CxtI = const_cast<Instruction *>(Q.CxtI);
OptimizationRemarkAnalysis ORA("value-tracking", "BadAssumption", CxtI);
Q.ORE->emit(ORA << "Detected conflicting code assumptions. Program may "
"have undefined behavior, or compiler may have "
"internal error.");
}
}
}
// Compute known bits from a shift operator, including those with a
// non-constant shift amount. Known is the output of this function. Known2 is a
// pre-allocated temporary with the same bit width as Known. KZF and KOF are
// operator-specific functors that, given the known-zero or known-one bits
// respectively, and a shift amount, compute the implied known-zero or known-one
// bits of the shift operator's result respectively for that shift amount. The
// results from calling KZF and KOF are conservatively combined for all
// permitted shift amounts.
static void computeKnownBitsFromShiftOperator(
const Operator *I, KnownBits &Known, KnownBits &Known2,
unsigned Depth, const Query &Q,
function_ref<APInt(const APInt &, unsigned)> KZF,
function_ref<APInt(const APInt &, unsigned)> KOF) {
unsigned BitWidth = Known.getBitWidth();
if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1);
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
Known.Zero = KZF(Known.Zero, ShiftAmt);
Known.One = KOF(Known.One, ShiftAmt);
// If there is conflict between Known.Zero and Known.One, this must be an
// overflowing left shift, so the shift result is undefined. Clear Known
// bits so that other code could propagate this undef.
if ((Known.Zero & Known.One) != 0)
Known.resetAll();
return;
}
computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
// If the shift amount could be greater than or equal to the bit-width of the LHS, the
// value could be undef, so we don't know anything about it.
if ((~Known.Zero).uge(BitWidth)) {
Known.resetAll();
return;
}
// Note: We cannot use Known.Zero.getLimitedValue() here, because if
// BitWidth > 64 and any upper bits are known, we'll end up returning the
// limit value (which implies all bits are known).
uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue();
uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue();
// It would be more clearly correct to use two separate temporaries for this
// calculation; the APInts are reused here to prevent unnecessary allocations.
Known.resetAll();
// If we know the shifter operand is nonzero, we can sometimes infer more
// known bits. However this is expensive to compute, so be lazy about it and
// only compute it when absolutely necessary.
Optional<bool> ShifterOperandIsNonZero;
// Early exit if we can't constrain any well-defined shift amount.
if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) &&
!(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) {
ShifterOperandIsNonZero =
isKnownNonZero(I->getOperand(1), Depth + 1, Q);
if (!*ShifterOperandIsNonZero)
return;
}
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
Known.Zero.setAllBits();
Known.One.setAllBits();
for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) {
// Combine the shifted known input bits only for those shift amounts
// compatible with its known constraints.
if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt)
continue;
if ((ShiftAmt | ShiftAmtKO) != ShiftAmt)
continue;
// If we know the shifter is nonzero, we may be able to infer more known
// bits. This check is sunk down as far as possible to avoid the expensive
// call to isKnownNonZero if the cheaper checks above fail.
if (ShiftAmt == 0) {
if (!ShifterOperandIsNonZero.hasValue())
ShifterOperandIsNonZero =
isKnownNonZero(I->getOperand(1), Depth + 1, Q);
if (*ShifterOperandIsNonZero)
continue;
}
Known.Zero &= KZF(Known2.Zero, ShiftAmt);
Known.One &= KOF(Known2.One, ShiftAmt);
}
// If there are no compatible shift amounts, then we've proven that the shift
// amount must be >= the BitWidth, and the result is undefined. We could
// return anything we'd like, but we need to make sure the sets of known bits
// stay disjoint (it should be better for some other code to actually
// propagate the undef than to pick a value here using known bits).
if (Known.Zero.intersects(Known.One))
Known.resetAll();
}
static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
unsigned Depth, const Query &Q) {
unsigned BitWidth = Known.getBitWidth();
KnownBits Known2(Known);
switch (I->getOpcode()) {
default: break;
case Instruction::Load:
if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range))
computeKnownBitsFromRangeMetadata(*MD, Known);
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-1 bits are only known if set in both the LHS & RHS.
Known.One &= Known2.One;
// Output known-0 bits are known to be clear if zero in either the LHS or RHS.
Known.Zero |= Known2.Zero;
// and(x, add (x, -1)) is a common idiom that always clears the low bit;
// here we handle the more general case of adding any odd number by
// matching the form and(x, add(x, y)) where y is odd.
// TODO: This could be generalized to clearing any bit set in y where the
// following bit is known to be unset in y.
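// For example, with i8 x = 6 (0b0110) and y = 1, x & (x + y) = 0b0110 &
// 0b0111 = 0b0110; with x = 7 (0b0111), x & (x + y) = 0b0111 & 0b1000 = 0.
// Either way bit 0 of the result is clear, because adding an odd y always
// flips bit 0 of x.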
Value *Y = nullptr;
if (!Known.Zero[0] && !Known.One[0] &&
(match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
m_Value(Y))) ||
match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
m_Value(Y))))) {
Known2.resetAll();
computeKnownBits(Y, Known2, Depth + 1, Q);
if (Known2.countMinTrailingOnes() > 0)
Known.Zero.setBit(0);
}
break;
}
case Instruction::Or: {
computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 bits are known to be set if set in either the LHS or RHS.
Known.One |= Known2.One;
break;
}
case Instruction::Xor: {
computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 bits are known to be set if set in exactly one of the LHS and RHS.
Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
Known.Zero = std::move(KnownZeroOut);
break;
}
case Instruction::Mul: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known,
Known2, Depth, Q);
break;
}
case Instruction::UDiv: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
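// For example, for an i32 udiv where the numerator has at least 4 known
// leading zeros and the denominator is known to be at least 256 (at most 23
// leading zeros), the quotient is below 2^28 / 2^8 = 2^20, so it gains
// 32 - 23 - 1 = 8 extra leading zeros for a total of at least 12.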
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
unsigned LeadZ = Known2.countMinLeadingZeros();
Known2.resetAll();
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
if (RHSMaxLeadingZeros != BitWidth)
LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
}
case Instruction::Select: {
const Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
if (SelectPatternResult::isMinOrMax(SPF)) {
computeKnownBits(RHS, Known, Depth + 1, Q);
computeKnownBits(LHS, Known2, Depth + 1, Q);
} else {
computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
}
unsigned MaxHighOnes = 0;
unsigned MaxHighZeros = 0;
if (SPF == SPF_SMAX) {
// If both sides are negative, the result is negative.
if (Known.isNegative() && Known2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
// If either side is non-negative, the result is non-negative.
else if (Known.isNonNegative() || Known2.isNonNegative())
MaxHighZeros = 1;
} else if (SPF == SPF_SMIN) {
// If both sides are non-negative, the result is non-negative.
if (Known.isNonNegative() && Known2.isNonNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros = std::max(Known.countMinLeadingZeros(),
Known2.countMinLeadingZeros());
// If either side is negative, the result is negative.
else if (Known.isNegative() || Known2.isNegative())
MaxHighOnes = 1;
} else if (SPF == SPF_UMAX) {
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
} else if (SPF == SPF_UMIN) {
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros =
std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
}
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
if (MaxHighOnes > 0)
Known.One.setHighBits(MaxHighOnes);
if (MaxHighZeros > 0)
Known.Zero.setHighBits(MaxHighZeros);
break;
}
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::SIToFP:
case Instruction::UIToFP:
break; // Can't work with floating point.
case Instruction::PtrToInt:
case Instruction::IntToPtr:
// Fall through and handle them the same as zext/trunc.
LLVM_FALLTHROUGH;
case Instruction::ZExt:
case Instruction::Trunc: {
Type *SrcTy = I->getOperand(0)->getType();
unsigned SrcBitWidth;
// Note that we handle pointer operands here because of inttoptr/ptrtoint
// which fall through here.
SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType());
assert(SrcBitWidth && "SrcBitWidth can't be zero");
Known = Known.zextOrTrunc(SrcBitWidth);
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
Known = Known.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
if (BitWidth > SrcBitWidth)
Known.Zero.setBitsFrom(SrcBitWidth);
break;
}
case Instruction::BitCast: {
Type *SrcTy = I->getOperand(0)->getType();
if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
// TODO: For now, not handling conversions like:
// (bitcast i64 %x to <2 x i32>)
!I->getType()->isVectorTy()) {
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
break;
}
break;
}
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
Known = Known.trunc(SrcBitWidth);
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
Known = Known.sext(BitWidth);
break;
}
case Instruction::Shl: {
// (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) {
APInt KZResult = KnownZero << ShiftAmt;
KZResult.setLowBits(ShiftAmt); // Low bits known 0.
// If this shift has the "nsw" keyword, then the result is either a poison
// value or has the same sign bit as the first operand.
if (NSW && KnownZero.isSignBitSet())
KZResult.setSignBit();
return KZResult;
};
auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) {
APInt KOResult = KnownOne << ShiftAmt;
if (NSW && KnownOne.isSignBitSet())
KOResult.setSignBit();
return KOResult;
};
computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::LShr: {
// (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
APInt KZResult = KnownZero.lshr(ShiftAmt);
// High bits known zero.
KZResult.setHighBits(ShiftAmt);
return KZResult;
};
auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
return KnownOne.lshr(ShiftAmt);
};
computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::AShr: {
// (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
return KnownZero.ashr(ShiftAmt);
};
auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
return KnownOne.ashr(ShiftAmt);
};
computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::Sub: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
Known, Known2, Depth, Q);
break;
}
case Instruction::Add: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
Known, Known2, Depth, Q);
break;
}
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// The low bits of the first operand are unchanged by the srem.
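// For example, srem X, 8 computes X - 8*q for some integer q, and subtracting
// a multiple of 8 cannot change bits 0..2, so they are copied from X.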
Known.Zero = Known2.Zero & LowBits;
Known.One = Known2.One & LowBits;
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero))
Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
if (Known2.isNegative() && LowBits.intersects(Known2.One))
Known.One |= ~LowBits;
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
break;
}
}
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero.
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// If it's known zero, our sign bit is also zero.
if (Known2.isNonNegative())
Known.makeNonNegative();
break;
case Instruction::URem: {
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
const APInt &RA = Rem->getValue();
if (RA.isPowerOf2()) {
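// X urem 2^k is equivalent to X & (2^k - 1): the high bits of the result
// are known zero and the low k bits are taken directly from X.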
APInt LowBits = (RA - 1);
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
Known.Zero |= ~LowBits;
Known.One &= LowBits;
break;
}
}
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
unsigned Leaders =
std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
}
case Instruction::Alloca: {
const AllocaInst *AI = cast<AllocaInst>(I);
unsigned Align = AI->getAlignment();
if (Align == 0)
Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
if (Align > 0)
Known.Zero.setLowBits(countTrailingZeros(Align));
break;
}
case Instruction::GetElementPtr: {
// Analyze all of the subscripts of this getelementptr instruction
// to determine if we can prove known low zero bits.
KnownBits LocalKnown(BitWidth);
computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
unsigned TrailZ = LocalKnown.countMinTrailingZeros();
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
Value *Index = I->getOperand(i);
if (StructType *STy = GTI.getStructTypeOrNull()) {
// Handle struct member offset arithmetic.
// Handle the case when the index is a vector zeroinitializer.
Constant *CIndex = cast<Constant>(Index);
if (CIndex->isZeroValue())
continue;
if (CIndex->getType()->isVectorTy())
Index = CIndex->getSplatValue();
unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
const StructLayout *SL = Q.DL.getStructLayout(STy);
uint64_t Offset = SL->getElementOffset(Idx);
TrailZ = std::min<unsigned>(TrailZ,
countTrailingZeros(Offset));
} else {
// Handle array index arithmetic.
Type *IndexedTy = GTI.getIndexedType();
if (!IndexedTy->isSized()) {
TrailZ = 0;
break;
}
unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy);
LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0);
computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
LocalKnown.countMinTrailingZeros()));
}
}
Known.Zero.setLowBits(TrailZ);
break;
}
case Instruction::PHI: {
const PHINode *P = cast<PHINode>(I);
// Handle the case of a simple two-predecessor recurrence PHI.
// There's a lot more that could theoretically be done here, but
// this is sufficient to catch some interesting cases.
if (P->getNumIncomingValues() == 2) {
for (unsigned i = 0; i != 2; ++i) {
Value *L = P->getIncomingValue(i);
Value *R = P->getIncomingValue(!i);
Operator *LU = dyn_cast<Operator>(L);
if (!LU)
continue;
unsigned Opcode = LU->getOpcode();
// Check for operations that have the property that if
// both their operands have low zero bits, the result
// will have low zero bits.
if (Opcode == Instruction::Add ||
Opcode == Instruction::Sub ||
Opcode == Instruction::And ||
Opcode == Instruction::Or ||
Opcode == Instruction::Mul) {
Value *LL = LU->getOperand(0);
Value *LR = LU->getOperand(1);
// Find a recurrence.
if (LL == I)
L = LR;
else if (LR == I)
L = LL;
else
break;
// Ok, we have a PHI of the form L op= R. Check for low
// zero bits.
computeKnownBits(R, Known2, Depth + 1, Q);
// We need to take the minimum number of known trailing zero bits from both.
KnownBits Known3(Known);
computeKnownBits(L, Known3, Depth + 1, Q);
Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
Known3.countMinTrailingZeros()));
if (DontImproveNonNegativePhiBits)
break;
auto *OverflowOp = dyn_cast<OverflowingBinaryOperator>(LU);
if (OverflowOp && OverflowOp->hasNoSignedWrap()) {
// If initial value of recurrence is nonnegative, and we are adding
// a nonnegative number with nsw, the result can only be nonnegative
// or poison value regardless of the number of times we execute the
// add in phi recurrence. If initial value is negative and we are
// adding a negative number with nsw, the result can only be
// negative or poison value. Similar arguments apply to sub and mul.
//
// (add non-negative, non-negative) --> non-negative
// (add negative, negative) --> negative
if (Opcode == Instruction::Add) {
if (Known2.isNonNegative() && Known3.isNonNegative())
Known.makeNonNegative();
else if (Known2.isNegative() && Known3.isNegative())
Known.makeNegative();
}
// (sub nsw non-negative, negative) --> non-negative
// (sub nsw negative, non-negative) --> negative
else if (Opcode == Instruction::Sub && LL == I) {
if (Known2.isNonNegative() && Known3.isNegative())
Known.makeNonNegative();
else if (Known2.isNegative() && Known3.isNonNegative())
Known.makeNegative();
}
// (mul nsw non-negative, non-negative) --> non-negative
else if (Opcode == Instruction::Mul && Known2.isNonNegative() &&
Known3.isNonNegative())
Known.makeNonNegative();
}
break;
}
}
}
// Unreachable blocks may have zero-operand PHI nodes.
if (P->getNumIncomingValues() == 0)
break;
// Otherwise take the intersection of the known bit sets of the operands,
// taking conservative care to avoid excessive recursion.
if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) {
// Skip if every incoming value refers back to the PHI itself.
if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
break;
Known.Zero.setAllBits();
Known.One.setAllBits();
for (Value *IncValue : P->incoming_values()) {
// Skip direct self references.
if (IncValue == P) continue;
Known2 = KnownBits(BitWidth);
// Recurse, but cap the recursion to one level, because we don't
// want to waste time spinning around in loops.
computeKnownBits(IncValue, Known2, MaxDepth - 1, Q);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
// If all bits have been ruled out, there's no need to check
// more operands.
if (!Known.Zero && !Known.One)
break;
}
}
break;
}
case Instruction::Call:
case Instruction::Invoke:
// If range metadata is attached to this call, set known bits from that,
// and then intersect with known bits based on other properties of the
// function.
if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range))
computeKnownBitsFromRangeMetadata(*MD, Known);
if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) {
computeKnownBits(RV, Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero;
Known.One |= Known2.One;
}
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::bitreverse:
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero.reverseBits();
Known.One |= Known2.One.reverseBits();
break;
case Intrinsic::bswap:
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero.byteSwap();
Known.One |= Known2.One.byteSwap();
break;
case Intrinsic::ctlz: {
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// If we have a known 1, its position is our upper bound.
unsigned PossibleLZ = Known2.One.countLeadingZeros();
// If this call is undefined for 0, the result will be at most BitWidth - 1.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
unsigned LowBits = Log2_32(PossibleLZ)+1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case Intrinsic::cttz: {
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// If we have a known 1, its position is our upper bound.
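// For example, if the lowest bit known to be one is bit 5, the result is at
// most 5, which fits in Log2_32(5) + 1 = 3 bits, so bits 3 and above of the
// result are known zero.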
unsigned PossibleTZ = Known2.One.countTrailingZeros();
// If this call is undefined for 0, the result will be at most BitWidth - 1.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
unsigned LowBits = Log2_32(PossibleTZ)+1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case Intrinsic::ctpop: {
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
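// For example, an i32 value with at most 10 possibly-set bits has a
// population count of at most 10, which fits in Log2_32(10) + 1 = 4 bits,
// so result bits 4 and above are known zero.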
unsigned BitsPossiblySet = Known2.countMaxPopulation();
unsigned LowBits = Log2_32(BitsPossiblySet)+1;
Known.Zero.setBitsFrom(LowBits);
// TODO: we could bound KnownOne using the lower bound on the number
// of bits which might be set provided by popcnt KnownOne2.
break;
}
case Intrinsic::x86_sse42_crc32_64_64:
Known.Zero.setBitsFrom(32);
break;
}
}
break;
case Instruction::ExtractElement:
// Look through extract element. At the moment we keep this simple and skip
// tracking the specific element. But at least we might find information
// valid for all elements of the vector (for example, if the vector is sign
// extended, shifted, etc).
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
break;
case Instruction::ExtractValue:
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
const ExtractValueInst *EVI = cast<ExtractValueInst>(I);
if (EVI->getNumIndices() != 1) break;
if (EVI->getIndices()[0] == 0) {
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
computeKnownBitsAddSub(true, II->getArgOperand(0),
II->getArgOperand(1), false, Known, Known2,
Depth, Q);
break;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
computeKnownBitsAddSub(false, II->getArgOperand(0),
II->getArgOperand(1), false, Known, Known2,
Depth, Q);
break;
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow:
computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false,
Known, Known2, Depth, Q);
break;
}
}
}
}
}
/// Determine which bits of V are known to be either zero or one and return
/// them.
KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
KnownBits Known(getBitWidth(V->getType(), Q.DL));
computeKnownBits(V, Known, Depth, Q);
return Known;
}
/// Determine which bits of V are known to be either zero or one and return
/// them in the Known bit set.
///
/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
/// we cannot optimize based on the assumption that it is zero without changing
/// it to be an explicit zero. If we don't change it to zero, other code could
/// optimize based on the contradictory assumption that it is non-zero.
/// Because instcombine aggressively folds operations with undef args anyway,
/// this won't lose us code quality.
///
/// This function is defined on values with integer type, values with pointer
/// type, and vectors of integers. In the case where V is a vector, the known
/// zero and known one values are the same width as the vector element, and a
/// bit is set only if it is true for all of the elements in the vector.
void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
const Query &Q) {
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
unsigned BitWidth = Known.getBitWidth();
assert((V->getType()->isIntOrIntVectorTy(BitWidth) ||
V->getType()->isPtrOrPtrVectorTy()) &&
"Not integer or pointer type!");
assert(Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth &&
"V and Known should have same BitWidth");
(void)BitWidth;
const APInt *C;
if (match(V, m_APInt(C))) {
// We know all of the bits for a scalar constant or a splat vector constant!
Known.One = *C;
Known.Zero = ~Known.One;
return;
}
// Null and aggregate-zero are all-zeros.
if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) {
Known.setAllZero();
return;
}
// Handle a constant vector by taking the intersection of the known bits of
// each element.
if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
// We know that CDS must be a vector of integers. Take the intersection of
// the known bits of each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
APInt Elt(BitWidth, 0);
for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
Elt = CDS->getElementAsInteger(i);
Known.Zero &= ~Elt;
Known.One &= Elt;
}
return;
}
if (const auto *CV = dyn_cast<ConstantVector>(V)) {
// We know that CV must be a vector of integers. Take the intersection of
// the known bits of each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
APInt Elt(BitWidth, 0);
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
Constant *Element = CV->getAggregateElement(i);
auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
if (!ElementCI) {
Known.resetAll();
return;
}
Elt = ElementCI->getValue();
Known.Zero &= ~Elt;
Known.One &= Elt;
}
return;
}
// Start out not knowing anything.
Known.resetAll();
// We can't imply anything about undefs.
if (isa<UndefValue>(V))
return;
// There's no point in looking through other users of ConstantData for
// assumptions. Confirm that we've handled them all.
assert(!isa<ConstantData>(V) && "Unhandled constant data!");
// Limit search depth.
// All recursive calls that increase depth must come after this.
if (Depth == MaxDepth)
return;
// An interposable GlobalAlias is totally unknown. A non-interposable
// GlobalAlias has the bits of its aliasee.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (!GA->isInterposable())
computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q);
return;
}
if (const Operator *I = dyn_cast<Operator>(V))
computeKnownBitsFromOperator(I, Known, Depth, Q);
// Aligned pointers have trailing zeros - refine Known.Zero set
if (V->getType()->isPointerTy()) {
unsigned Align = V->getPointerAlignment(Q.DL);
if (Align)
Known.Zero.setLowBits(countTrailingZeros(Align));
}
// computeKnownBitsFromAssume strictly refines Known.
// Therefore, we run them after computeKnownBitsFromOperator.
// Check whether a nearby assume intrinsic can determine some known bits.
computeKnownBitsFromAssume(V, Known, Depth, Q);
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
/// Return true if the given value is known to have exactly one
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
/// types and vectors of integers.
bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
const Query &Q) {
if (const Constant *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return OrZero;
const APInt *ConstIntOrConstSplatInt;
if (match(C, m_APInt(ConstIntOrConstSplatInt)))
return ConstIntOrConstSplatInt->isPowerOf2();
}
// 1 << X is clearly a power of two if the one is not shifted off the end. If
// it is shifted off the end then the result is undefined.
if (match(V, m_Shl(m_One(), m_Value())))
return true;
// (signmask) >>l X is clearly a power of two if the one is not shifted off
// the bottom. If it is shifted off the bottom then the result is undefined.
if (match(V, m_LShr(m_SignMask(), m_Value())))
return true;
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth++ == MaxDepth)
return false;
Value *X = nullptr, *Y = nullptr;
// A shift left or a logical shift right of a power of two is a power of two
// or zero.
if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
match(V, m_LShr(m_Value(X), m_Value()))))
return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q);
if (const ZExtInst *ZI = dyn_cast<ZExtInst>(V))
return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q);
if (const SelectInst *SI = dyn_cast<SelectInst>(V))
return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) &&
isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q);
if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) {
// A power of two and'd with anything is a power of two or zero.
if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q) ||
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q))
return true;
// X & (-X) is always a power of two or zero.
if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X))))
return true;
return false;
}
// Adding a power-of-two or zero to the same power-of-two or zero yields
// either the original power-of-two, a larger power-of-two or zero.
if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
const OverflowingBinaryOperator *VOBO = cast<OverflowingBinaryOperator>(V);
if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) {
if (match(X, m_And(m_Specific(Y), m_Value())) ||
match(X, m_And(m_Value(), m_Specific(Y))))
if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q))
return true;
if (match(Y, m_And(m_Specific(X), m_Value())) ||
match(Y, m_And(m_Value(), m_Specific(X))))
if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q))
return true;
unsigned BitWidth = V->getType()->getScalarSizeInBits();
KnownBits LHSBits(BitWidth);
computeKnownBits(X, LHSBits, Depth, Q);
KnownBits RHSBits(BitWidth);
computeKnownBits(Y, RHSBits, Depth, Q);
// If i8 V is a power of two or zero:
// ZeroBits: 1 1 1 0 1 1 1 1
// ~ZeroBits: 0 0 0 1 0 0 0 0
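// With that pattern in both operands, each operand is either 0 or 16, so the
// sum can only be 0, 16, or 32 -- always a power of two or zero.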
if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2())
// If OrZero isn't set, we cannot give back a zero result.
// Make sure either the LHS or RHS has a bit set.
if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue())
return true;
}
}
// An exact divide or right shift can only shift off zero bits, so the result
// is a power of two only if the first operand is a power of two and not
// copying a sign bit (sdiv int_min, 2).
if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) ||
match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) {
return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero,
Depth, Q);
}
return false;
}
/// \brief Test whether a GEP's result is known to be non-null.
///
/// Uses properties inherent in a GEP to try to determine whether it is known
/// to be non-null.
///
/// Currently this routine does not support vector GEPs.
static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth,
const Query &Q) {
if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
return false;
// FIXME: Support vector-GEPs.
assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP");
// If the base pointer is non-null, we cannot walk to a null address with an
// inbounds GEP in address space zero.
if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q))
return true;
// Walk the GEP operands and see if any operand introduces a non-zero offset.
// If so, then the GEP cannot produce a null pointer, as doing so would
// inherently violate the inbounds contract within address space zero.
for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
GTI != GTE; ++GTI) {
// Struct types are easy -- they must always be indexed by a constant.
if (StructType *STy = GTI.getStructTypeOrNull()) {
ConstantInt *OpC = cast<ConstantInt>(GTI.getOperand());
unsigned ElementIdx = OpC->getZExtValue();
const StructLayout *SL = Q.DL.getStructLayout(STy);
uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
if (ElementOffset > 0)
return true;
continue;
}
// If we have a zero-sized type, the index doesn't matter. Keep looping.
if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0)
continue;
// Fast path the constant operand case both for efficiency and so we don't
// increment Depth when just zipping down an all-constant GEP.
if (ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand())) {
if (!OpC->isZero())
return true;
continue;
}
// We post-increment Depth here because while isKnownNonZero increments it
// as well, when we pop back up that increment won't persist. We don't want
// to recurse 10k times just because we have 10k GEP operands. We don't
// bail completely out because we want to handle constant GEPs regardless
// of depth.
if (Depth++ >= MaxDepth)
continue;
if (isKnownNonZero(GTI.getOperand(), Depth, Q))
return true;
}
return false;
}
/// Does the 'Range' metadata (which must be a valid MD_range operand list)
/// ensure that the value it's attached to is never Value? 'RangeType' is
/// the type of the value described by the range.
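/// For example, a single half-open range [1, 256) does not contain 0, so a
/// value carrying that metadata is known to be non-zero.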
static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value) {
const unsigned NumRanges = Ranges->getNumOperands() / 2;
assert(NumRanges >= 1);
for (unsigned i = 0; i < NumRanges; ++i) {
ConstantInt *Lower =
mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 0));
ConstantInt *Upper =
mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 1));
ConstantRange Range(Lower->getValue(), Upper->getValue());
if (Range.contains(Value))
return false;
}
return true;
}
/// Return true if the given value is known to be non-zero when defined. For
/// vectors, return true if every element is known to be non-zero when
/// defined. For pointers, if the context instruction and dominator tree are
/// specified, perform context-sensitive analysis and return true if the
/// pointer couldn't possibly be null at the specified instruction.
/// Supports values with integer or pointer type and vectors of integers.
bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (auto *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return false;
if (isa<ConstantInt>(C))
// Must be non-zero due to null test above.
return true;
// For constant vectors, check that all elements are undefined or known
// non-zero to determine that the whole vector is known non-zero.
if (auto *VecTy = dyn_cast<VectorType>(C->getType())) {
for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
Constant *Elt = C->getAggregateElement(i);
if (!Elt || Elt->isNullValue())
return false;
if (!isa<UndefValue>(Elt) && !isa<ConstantInt>(Elt))
return false;
}
return true;
}
return false;
}
if (auto *I = dyn_cast<Instruction>(V)) {
if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) {
// If the possible ranges don't contain zero, then the value is
// definitely non-zero.
if (auto *Ty = dyn_cast<IntegerType>(V->getType())) {
const APInt ZeroValue(Ty->getBitWidth(), 0);
if (rangeMetadataExcludesValue(Ranges, ZeroValue))
return true;
}
}
}
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth++ >= MaxDepth)
return false;
// Check for pointer simplifications.
if (V->getType()->isPointerTy()) {
if (isKnownNonNullAt(V, Q.CxtI, Q.DT))
return true;
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
if (isGEPKnownNonNull(GEP, Depth, Q))
return true;
}
unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL);
// X | Y != 0 if X != 0 or Y != 0.
Value *X = nullptr, *Y = nullptr;
if (match(V, m_Or(m_Value(X), m_Value(Y))))
return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q);
// ext X != 0 if X != 0.
if (isa<SExtInst>(V) || isa<ZExtInst>(V))
return isKnownNonZero(cast<Instruction>(V)->getOperand(0), Depth, Q);
// shl X, Y != 0 if X is odd. Note that the value of the shift is undefined
// if the lowest bit is shifted off the end.
if (match(V, m_Shl(m_Value(X), m_Value(Y)))) {
// shl nuw can't remove any non-zero bits.
const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
if (BO->hasNoUnsignedWrap())
return isKnownNonZero(X, Depth, Q);
KnownBits Known(BitWidth);
computeKnownBits(X, Known, Depth, Q);
if (Known.One[0])
return true;
}
// shr X, Y != 0 if X is negative. Note that the value of the shift is not
// defined if the sign bit is shifted off the end.
else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) {
// shr exact can only shift out zero bits.
const PossiblyExactOperator *BO = cast<PossiblyExactOperator>(V);
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
KnownBits Known = computeKnownBits(X, Depth, Q);
if (Known.isNegative())
return true;
// If the shifter operand is a constant, and all of the bits shifted
// out are known to be zero, and X is known non-zero then at least one
// non-zero bit must remain.
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
if (Known.countMinTrailingZeros() >= ShiftVal)
return isKnownNonZero(X, Depth, Q);
}
}
// div exact can only produce a zero if the dividend is zero.
else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
return isKnownNonZero(X, Depth, Q);
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
KnownBits XKnown = computeKnownBits(X, Depth, Q);
KnownBits YKnown = computeKnownBits(Y, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
if (XKnown.isNonNegative() && YKnown.isNonNegative())
if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
// zero unless both X and Y equal INT_MIN.
if (XKnown.isNegative() && YKnown.isNegative()) {
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
if (XKnown.One.intersects(Mask))
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
if (YKnown.One.intersects(Mask))
return true;
}
// The sum of a non-negative number and a power of two is not zero.
if (XKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
if (YKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
// X * Y.
else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) {
const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
// If X and Y are non-zero then so is X * Y as long as the multiplication
// does not overflow.
if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) &&
isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q))
return true;
}
// (C ? X : Y) != 0 if X != 0 and Y != 0.
else if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
if (isKnownNonZero(SI->getTrueValue(), Depth, Q) &&
isKnownNonZero(SI->getFalseValue(), Depth, Q))
return true;
}
// PHI
else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
// Try and detect a recurrence that monotonically increases from a
// starting value, as these are common as induction variables.
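// For example, %i = phi i32 [ 1, %entry ], [ %next, %loop ] with
// %next = add nuw i32 %i, 2 starts at a positive constant and steps by a
// non-negative amount without wrapping, so it can never become zero.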
if (PN->getNumIncomingValues() == 2) {
Value *Start = PN->getIncomingValue(0);
Value *Induction = PN->getIncomingValue(1);
if (isa<ConstantInt>(Induction) && !isa<ConstantInt>(Start))
std::swap(Start, Induction);
if (ConstantInt *C = dyn_cast<ConstantInt>(Start)) {
if (!C->isZero() && !C->isNegative()) {
ConstantInt *X;
if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) ||
match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) &&
!X->isNegative())
return true;
}
}
}
// Check if all incoming values are non-zero constants.
bool AllNonZeroConstants = all_of(PN->operands(), [](Value *V) {
return isa<ConstantInt>(V) && !cast<ConstantInt>(V)->isZero();
});
if (AllNonZeroConstants)
return true;
}
KnownBits Known(BitWidth);
computeKnownBits(V, Known, Depth, Q);
return Known.One != 0;
}
/// Return true if V2 == V1 + X, where X is known non-zero.
static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) {
const BinaryOperator *BO = dyn_cast<BinaryOperator>(V1);
if (!BO || BO->getOpcode() != Instruction::Add)
return false;
Value *Op = nullptr;
if (V2 == BO->getOperand(0))
Op = BO->getOperand(1);
else if (V2 == BO->getOperand(1))
Op = BO->getOperand(0);
else
return false;
return isKnownNonZero(Op, 0, Q);
}
/// Return true if it is known that V1 != V2.
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) {
if (V1 == V2)
return false;
if (V1->getType() != V2->getType())
// We can't look through casts yet.
return false;
if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q))
return true;
if (V1->getType()->isIntOrIntVectorTy()) {
// Are any known bits in V1 contradictory to known bits in V2? If V1
// has a known zero where V2 has a known one, they must not be equal.
KnownBits Known1 = computeKnownBits(V1, 0, Q);
KnownBits Known2 = computeKnownBits(V2, 0, Q);
if (Known1.Zero.intersects(Known2.One) ||
Known2.Zero.intersects(Known1.One))
return true;
}
return false;
}
/// Return true if 'V & Mask' is known to be zero. We use this predicate to
/// simplify operations downstream. Mask is known to be zero for bits that V
/// cannot have.
///
/// This function is defined on values with integer type, values with pointer
/// type, and vectors of integers. In the case
/// where V is a vector, the mask, known zero, and known one values are the
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
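/// For example, if V is known to be a multiple of 4, its low two bits are in
/// Known.Zero, so a Mask of 3 (binary 11) yields true.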
bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
const Query &Q) {
KnownBits Known(Mask.getBitWidth());
computeKnownBits(V, Known, Depth, Q);
return Mask.isSubsetOf(Known.Zero);
}
/// For vector constants, loop over the elements and find the constant with the
/// minimum number of sign bits. Return 0 if the value is not a vector constant
/// or if any element was not analyzed; otherwise, return the count for the
/// element with the minimum number of sign bits.
static unsigned computeNumSignBitsVectorConstant(const Value *V,
unsigned TyBits) {
const auto *CV = dyn_cast<Constant>(V);
if (!CV || !CV->getType()->isVectorTy())
return 0;
unsigned MinSignBits = TyBits;
unsigned NumElts = CV->getType()->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
// If we find a non-ConstantInt, bail out.
auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
if (!Elt)
return 0;
// If the sign bit is 1, flip the bits, so we always count leading zeros.
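// For example, i32 -16 is 0xFFFFFFF0; flipping gives 0x0000000F, whose 28
// leading zeros match the 28 copies of the sign bit in -16.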
APInt EltVal = Elt->getValue();
if (EltVal.isNegative())
EltVal = ~EltVal;
MinSignBits = std::min(MinSignBits, EltVal.countLeadingZeros());
}
return MinSignBits;
}
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q);
static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
const Query &Q) {
unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
assert(Result > 0 && "At least one sign bit needs to be present!");
return Result;
}
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign bit
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
/// vector element with the minimum number of known sign bits.
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
const Query &Q) {
// We return the minimum number of sign bits that are guaranteed to be present
// in V, so for undef we have to conservatively return 1. We don't have the
// same behavior for poison though -- that's a FIXME today.
unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
// Note that ConstantInt is handled by the general computeKnownBits case
// below.
if (Depth == MaxDepth)
return 1; // Limit search depth.
const Operator *U = dyn_cast<Operator>(V);
switch (Operator::getOpcode(V)) {
default: break;
case Instruction::SExt:
Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp;
case Instruction::SDiv: {
const APInt *Denominator;
// sdiv X, C -> adds log(C) sign bits.
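// For example, if X has at least 5 sign bits and C is 8, the magnitude of
// the quotient shrinks by a factor of 8, giving at least 5 + log2(8) = 8
// sign bits.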
if (match(U->getOperand(1), m_APInt(Denominator))) {
// Ignore non-positive denominator.
if (!Denominator->isStrictlyPositive())
break;
// Calculate the incoming numerator bits.
unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// Add floor(log(C)) bits to the numerator bits.
return std::min(TyBits, NumBits + Denominator->logBase2());
}
break;
}
case Instruction::SRem: {
const APInt *Denominator;
// srem X, C -> we know that the result is within [-C+1,C) when C is a
// positive constant. This lets us put a lower bound on the number of sign
// bits.
if (match(U->getOperand(1), m_APInt(Denominator))) {
// Ignore non-positive denominator.
if (!Denominator->isStrictlyPositive())
break;
// Calculate the incoming numerator bits. SRem by a positive constant
// can't lower the number of sign bits.
unsigned NumrBits =
ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// Calculate the leading sign bit constraints by examining the
// denominator. Given that the denominator is positive, there are two
// cases:
//
// 1. the numerator is positive. The result range is [0,C) and [0,C) u<
// (1 << ceilLogBase2(C)).
//
// 2. the numerator is negative. Then the result range is (-C,0] and
// integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)).
//
// Thus a lower bound on the number of sign bits is `TyBits -
// ceilLogBase2(C)`.
unsigned ResBits = TyBits - Denominator->ceilLogBase2();
return std::max(NumrBits, ResBits);
}
break;
}
case Instruction::AShr: {
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
// ashr X, C -> adds C sign bits. Vectors too.
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
unsigned ShAmtLimited = ShAmt->getZExtValue();
if (ShAmtLimited >= TyBits)
break; // Bad shift.
Tmp += ShAmtLimited;
if (Tmp > TyBits) Tmp = TyBits;
}
return Tmp;
}
case Instruction::Shl: {
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
// shl destroys sign bits.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
Tmp2 = ShAmt->getZExtValue();
if (Tmp2 >= TyBits || // Bad shift.
Tmp2 >= Tmp) break; // Shifted all sign bits out.
return Tmp - Tmp2;
}
break;
}
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: // NOT is handled here.
// Logical binary ops preserve the number of sign bits at the worst.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp != 1) {
Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
FirstAnswer = std::min(Tmp, Tmp2);
// We computed what we know about the sign bits as our first
// answer. Now proceed to the generic code that uses
// computeKnownBits, and pick whichever answer is better.
}
break;
case Instruction::Select:
Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q);
return std::min(Tmp, Tmp2);
case Instruction::Add:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
// Special case decrementing a value (ADD X, -1):
if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
if (CRHS->isAllOnesValue()) {
KnownBits Known(TyBits);
computeKnownBits(U->getOperand(0), Known, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return TyBits;
// If we are subtracting one from a positive number, there is no carry
// out of the result.
if (Known.isNonNegative())
return Tmp;
}
Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp2 == 1) return 1;
return std::min(Tmp, Tmp2)-1;
case Instruction::Sub:
Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
if (Tmp2 == 1) return 1;
// Handle NEG.
if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
if (CLHS->isNullValue()) {
KnownBits Known(TyBits);
computeKnownBits(U->getOperand(1), Known, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return TyBits;
// If the input is known to be positive (the sign bit is known clear),
// the output of the NEG has the same number of sign bits as the input.
if (Known.isNonNegative())
return Tmp2;
// Otherwise, we treat this like a SUB.
}
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
if (Tmp == 1) return 1; // Early out.
return std::min(Tmp, Tmp2)-1;
case Instruction::PHI: {
const PHINode *PN = cast<PHINode>(U);
unsigned NumIncomingValues = PN->getNumIncomingValues();
// Don't analyze large in-degree PHIs.
if (NumIncomingValues > 4) break;
// Unreachable blocks may have zero-operand PHI nodes.
if (NumIncomingValues == 0) break;
// Take the minimum of all incoming values. This can't infinitely loop
// because of our depth threshold.
Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q);
for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) {
if (Tmp == 1) return Tmp;
Tmp = std::min(
Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q));
}
return Tmp;
}
case Instruction::Trunc:
// FIXME: it's tricky to do anything useful for this, but it is an important
// case for targets like X86.
break;
case Instruction::ExtractElement:
// Look through extract element. At the moment we keep this simple and skip
// tracking the specific element. But at least we might find information
// valid for all elements of the vector (for example, if the vector is sign
// extended, shifted, etc).
return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
// If we can examine all elements of a vector constant successfully, we're
// done (we can't do any better than that). If not, keep trying.
if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
return VecSignBits;
KnownBits Known(TyBits);
computeKnownBits(V, Known, Depth, Q);
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
return std::max(FirstAnswer, Known.countMinSignBits());
}
/// This function computes the integer multiple of Base that equals V.
/// If successful, it returns true and stores the multiple in
/// Multiple. If unsuccessful, it returns false. It looks
/// through SExt instructions only if LookThroughSExt is true.
bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
bool LookThroughSExt, unsigned Depth) {
const unsigned MaxDepth = 6;
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
assert(V->getType()->isIntegerTy() && "Not integer type!");
Type *T = V->getType();
ConstantInt *CI = dyn_cast<ConstantInt>(V);
if (Base == 0)
return false;
if (Base == 1) {
Multiple = V;
return true;
}
ConstantExpr *CO = dyn_cast<ConstantExpr>(V);
Constant *BaseVal = ConstantInt::get(T, Base);
if (CO && CO == BaseVal) {
// Multiple is 1.
Multiple = ConstantInt::get(T, 1);
return true;
}
if (CI && CI->getZExtValue() % Base == 0) {
Multiple = ConstantInt::get(T, CI->getZExtValue() / Base);
return true;
}
if (Depth == MaxDepth) return false; // Limit search depth.
Operator *I = dyn_cast<Operator>(V);
if (!I) return false;
switch (I->getOpcode()) {
default: break;
case Instruction::SExt:
if (!LookThroughSExt) return false;
// otherwise fall through to ZExt
LLVM_FALLTHROUGH;
case Instruction::ZExt:
return ComputeMultiple(I->getOperand(0), Base, Multiple,
LookThroughSExt, Depth+1);
case Instruction::Shl:
case Instruction::Mul: {
Value *Op0 = I->getOperand(0);
Value *Op1 = I->getOperand(1);
if (I->getOpcode() == Instruction::Shl) {
ConstantInt *Op1CI = dyn_cast<ConstantInt>(Op1);
if (!Op1CI) return false;
// Turn Op0 << Op1 into Op0 * 2^Op1
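// e.g. X << 3 becomes X * 8, so the multiply handling below applies.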
APInt Op1Int = Op1CI->getValue();
uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1);
APInt API(Op1Int.getBitWidth(), 0);
API.setBit(BitToSet);
Op1 = ConstantInt::get(V->getContext(), API);
}
Value *Mul0 = nullptr;
if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) {
if (Constant *Op1C = dyn_cast<Constant>(Op1))
if (Constant *MulC = dyn_cast<Constant>(Mul0)) {
if (Op1C->getType()->getPrimitiveSizeInBits() <
MulC->getType()->getPrimitiveSizeInBits())
Op1C = ConstantExpr::getZExt(Op1C, MulC->getType());
if (Op1C->getType()->getPrimitiveSizeInBits() >
MulC->getType()->getPrimitiveSizeInBits())
MulC = ConstantExpr::getZExt(MulC, Op1C->getType());
// V == Base * (Mul0 * Op1), so return (Mul0 * Op1)
Multiple = ConstantExpr::getMul(MulC, Op1C);
return true;
}
if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0))
if (Mul0CI->getValue() == 1) {
// V == Base * Op1, so return Op1
Multiple = Op1;
return true;
}
}
Value *Mul1 = nullptr;
if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) {
if (Constant *Op0C = dyn_cast<Constant>(Op0))
if (Constant *MulC = dyn_cast<Constant>(Mul1)) {
if (Op0C->getType()->getPrimitiveSizeInBits() <
MulC->getType()->getPrimitiveSizeInBits())
Op0C = ConstantExpr::getZExt(Op0C, MulC->getType());
if (Op0C->getType()->getPrimitiveSizeInBits() >
MulC->getType()->getPrimitiveSizeInBits())
MulC = ConstantExpr::getZExt(MulC, Op0C->getType());
// V == Base * (Mul1 * Op0), so return (Mul1 * Op0)
Multiple = ConstantExpr::getMul(MulC, Op0C);
return true;
}
if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1))
if (Mul1CI->getValue() == 1) {
// V == Base * Op0, so return Op0
Multiple = Op0;
return true;
}
}
}
}
// We could not determine if V is a multiple of Base.
return false;
}
Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
const TargetLibraryInfo *TLI) {
const Function *F = ICS.getCalledFunction();
if (!F)
return Intrinsic::not_intrinsic;
if (F->isIntrinsic())
return F->getIntrinsicID();
if (!TLI)
return Intrinsic::not_intrinsic;
LibFunc Func;
// We're going to make assumptions about the semantics of the function, so
// check that the target knows that it's available in this environment and
// that it does not have local linkage.
if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(*F, Func))
return Intrinsic::not_intrinsic;
if (!ICS.onlyReadsMemory())
return Intrinsic::not_intrinsic;
// Otherwise check if we have a call to a function that can be turned into a
// vector intrinsic.
switch (Func) {
default:
break;
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinl:
return Intrinsic::sin;
case LibFunc_cos:
case LibFunc_cosf:
case LibFunc_cosl:
return Intrinsic::cos;
case LibFunc_exp:
case LibFunc_expf:
case LibFunc_expl:
return Intrinsic::exp;
case LibFunc_exp2:
case LibFunc_exp2f:
case LibFunc_exp2l:
return Intrinsic::exp2;
case LibFunc_log:
case LibFunc_logf:
case LibFunc_logl:
return Intrinsic::log;
case LibFunc_log10:
case LibFunc_log10f:
case LibFunc_log10l:
return Intrinsic::log10;
case LibFunc_log2:
case LibFunc_log2f:
case LibFunc_log2l:
return Intrinsic::log2;
case LibFunc_fabs:
case LibFunc_fabsf:
case LibFunc_fabsl:
return Intrinsic::fabs;
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
return Intrinsic::minnum;
case LibFunc_fmax:
case LibFunc_fmaxf:
case LibFunc_fmaxl:
return Intrinsic::maxnum;
case LibFunc_copysign:
case LibFunc_copysignf:
case LibFunc_copysignl:
return Intrinsic::copysign;
case LibFunc_floor:
case LibFunc_floorf:
case LibFunc_floorl:
return Intrinsic::floor;
case LibFunc_ceil:
case LibFunc_ceilf:
case LibFunc_ceill:
return Intrinsic::ceil;
case LibFunc_trunc:
case LibFunc_truncf:
case LibFunc_truncl:
return Intrinsic::trunc;
case LibFunc_rint:
case LibFunc_rintf:
case LibFunc_rintl:
return Intrinsic::rint;
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
return Intrinsic::nearbyint;
case LibFunc_round:
case LibFunc_roundf:
case LibFunc_roundl:
return Intrinsic::round;
case LibFunc_pow:
case LibFunc_powf:
case LibFunc_powl:
return Intrinsic::pow;
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
if (ICS->hasNoNaNs())
return Intrinsic::sqrt;
return Intrinsic::not_intrinsic;
}
return Intrinsic::not_intrinsic;
}
/// Return true if we can prove that the specified FP value is never equal to
/// -0.0.
///
/// NOTE: this function will need to be revisited when we support non-default
/// rounding modes!
///
bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
unsigned Depth) {
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegZero();
if (Depth == MaxDepth)
return false; // Limit search depth.
const Operator *I = dyn_cast<Operator>(V);
if (!I) return false;
// Check if the nsz fast-math flag is set
if (const FPMathOperator *FPO = dyn_cast<FPMathOperator>(I))
if (FPO->hasNoSignedZeros())
return true;
// (add x, 0.0) is guaranteed to return +0.0, not -0.0.
if (I->getOpcode() == Instruction::FAdd)
if (ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(1)))
if (CFP->isNullValue())
return true;
// sitofp and uitofp turn into +0.0 for zero.
if (isa<SIToFPInst>(I) || isa<UIToFPInst>(I))
return true;
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
switch (IID) {
default:
break;
// sqrt(-0.0) = -0.0, no other negative results are possible.
case Intrinsic::sqrt:
return CannotBeNegativeZero(CI->getArgOperand(0), TLI, Depth + 1);
// fabs(x) != -0.0
case Intrinsic::fabs:
return true;
}
}
return false;
}
/// If \p SignBitOnly is true, test for a known 0 sign bit rather than a
/// standard ordered compare, i.e. treat -0.0 as if it were ordered less than
/// 0.0 because of its sign bit even though the two compare equal.
static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
const TargetLibraryInfo *TLI,
bool SignBitOnly,
unsigned Depth) {
// TODO: This function does not do the right thing when SignBitOnly is true
// and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform
// which flips the sign bits of NaNs. See
// https://llvm.org/bugs/show_bug.cgi?id=31702.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
return !CFP->getValueAPF().isNegative() ||
(!SignBitOnly && CFP->getValueAPF().isZero());
}
if (Depth == MaxDepth)
return false; // Limit search depth.
const Operator *I = dyn_cast<Operator>(V);
if (!I)
return false;
switch (I->getOpcode()) {
default:
break;
// Unsigned integers are always nonnegative.
case Instruction::UIToFP:
return true;
case Instruction::FMul:
// x*x is always non-negative or a NaN.
if (I->getOperand(0) == I->getOperand(1) &&
(!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()))
return true;
LLVM_FALLTHROUGH;
case Instruction::FAdd:
case Instruction::FDiv:
case Instruction::FRem:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1) &&
cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
Depth + 1);
case Instruction::Select:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
Depth + 1) &&
cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly,
Depth + 1);
case Instruction::FPExt:
case Instruction::FPTrunc:
// Widening/narrowing never change sign.
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
case Instruction::Call:
const auto *CI = cast<CallInst>(I);
Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
switch (IID) {
default:
break;
case Intrinsic::maxnum:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1) ||
cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
Depth + 1);
case Intrinsic::minnum:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1) &&
cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
Depth + 1);
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::fabs:
return true;
case Intrinsic::sqrt:
// sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0.
if (!SignBitOnly)
return true;
return CI->hasNoNaNs() && (CI->hasNoSignedZeros() ||
CannotBeNegativeZero(CI->getOperand(0), TLI));
case Intrinsic::powi:
if (ConstantInt *Exponent = dyn_cast<ConstantInt>(I->getOperand(1))) {
// powi(x,n) is non-negative if n is even.
if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0)
return true;
}
// TODO: This is not correct. Given that exp is an integer, here are the
// ways that pow can return a negative value:
//
// pow(x, exp) --> negative if exp is odd and x is negative.
// pow(-0, exp) --> -inf if exp is negative odd.
// pow(-0, exp) --> -0 if exp is positive odd.
// pow(-inf, exp) --> -0 if exp is negative odd.
// pow(-inf, exp) --> -inf if exp is positive odd.
//
// Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN,
// but we must return false if x == -0. Unfortunately we do not currently
// have a way of expressing this constraint. See details in
// https://llvm.org/bugs/show_bug.cgi?id=31702.
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
case Intrinsic::fma:
case Intrinsic::fmuladd:
// x*x+y is non-negative if y is non-negative.
return I->getOperand(0) == I->getOperand(1) &&
(!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()) &&
cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly,
Depth + 1);
}
break;
}
return false;
}
bool llvm::CannotBeOrderedLessThanZero(const Value *V,
const TargetLibraryInfo *TLI) {
return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0);
}
bool llvm::SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI) {
return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0);
}
/// If the specified value can be set by repeating the same byte in memory,
/// return the i8 value that it is represented with. This is trivially true
/// for all i8 values, but is also true for i32 0, i32 -1, i16 0xF0F0,
/// double 0.0, etc. If the value can't be handled with a repeated byte store
/// (e.g. i16 0x1234), return null.
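/// For example (illustrative): i32 0x01010101 yields i8 1, while i16 0x0102
/// yields null because its two bytes differ.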
Value *llvm::isBytewiseValue(Value *V) {
// All byte-wide stores are splatable, even of arbitrary variables.
if (V->getType()->isIntegerTy(8)) return V;
// Handle 'null' ConstantArrayZero etc.
if (Constant *C = dyn_cast<Constant>(V))
if (C->isNullValue())
return Constant::getNullValue(Type::getInt8Ty(V->getContext()));
// Constant float and double values can be handled as integer values if the
// corresponding integer value is "byteable". An important case is 0.0.
if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
if (CFP->getType()->isFloatTy())
V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext()));
if (CFP->getType()->isDoubleTy())
V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext()));
// Don't handle long double formats, which have strange constraints.
}
// We can handle constant integers that are a multiple of 8 bits.
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
if (CI->getBitWidth() % 8 == 0) {
assert(CI->getBitWidth() > 8 && "8 bits should be handled above!");
if (!CI->getValue().isSplat(8))
return nullptr;
return ConstantInt::get(V->getContext(), CI->getValue().trunc(8));
}
}
// A ConstantDataArray/Vector is splatable if all its members are equal and
// also splatable.
if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) {
Value *Elt = CA->getElementAsConstant(0);
Value *Val = isBytewiseValue(Elt);
if (!Val)
return nullptr;
for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I)
if (CA->getElementAsConstant(I) != Elt)
return nullptr;
return Val;
}
// Conceptually, we could handle things like:
// %a = zext i8 %X to i16
// %b = shl i16 %a, 8
// %c = or i16 %a, %b
// but until there is an example that actually needs this, it doesn't seem
// worth worrying about.
return nullptr;
}
// This is the recursive version of BuildSubAggregate. It takes a few different
// arguments. Idxs is the index within the nested struct From that we are
// looking at now (which is of type IndexedType). IdxSkip is the number of
// indices from Idxs that should be left out when inserting into the resulting
// struct. To is the result struct built so far, new insertvalue instructions
// build on that.
static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
SmallVectorImpl<unsigned> &Idxs,
unsigned IdxSkip,
Instruction *InsertBefore) {
llvm::StructType *STy = dyn_cast<llvm::StructType>(IndexedType);
if (STy) {
// Save the original To argument so we can modify it
Value *OrigTo = To;
// General case, the type indexed by Idxs is a struct
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
// Process each struct element recursively
Idxs.push_back(i);
Value *PrevTo = To;
To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip,
InsertBefore);
Idxs.pop_back();
if (!To) {
// Couldn't find any inserted value for this index? Cleanup
while (PrevTo != OrigTo) {
InsertValueInst* Del = cast<InsertValueInst>(PrevTo);
PrevTo = Del->getAggregateOperand();
Del->eraseFromParent();
}
// Stop processing elements
break;
}
}
// If we successfully found a value for each of our subaggregates
if (To)
return To;
}
// Base case, the type indexed by Idxs is not a struct, or not all of
// the struct's elements had a value that was inserted directly. In the latter
// case, perhaps we can't determine each of the subelements individually, but
// we might be able to find the complete struct somewhere.
// Find the value that is at that particular spot
Value *V = FindInsertedValue(From, Idxs);
if (!V)
return nullptr;
// Insert the value in the new (sub) aggregate
return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
"tmp", InsertBefore);
}
// This helper takes a nested struct and extracts a part of it (which is again a
// struct) into a new value. For example, given the struct:
// { a, { b, { c, d }, e } }
// and the indices "1, 1" this returns
// { c, d }.
//
// It does this by inserting an insertvalue for each element in the resulting
// struct, as opposed to just inserting a single struct. This will only work if
// each of the elements of the substruct is known (i.e., inserted into From by an
// insertvalue instruction somewhere).
//
// All inserted insertvalue instructions are inserted before InsertBefore
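// For example (illustrative, assuming i32 elements that were inserted into
// From individually), extracting the { c, d } substruct above would emit
// roughly:
//   %tmp = insertvalue { i32, i32 } undef, i32 %c, 0
//   %tmp2 = insertvalue { i32, i32 } %tmp, i32 %d, 1
// where %c and %d are hypothetical names for the previously inserted values.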
static Value *BuildSubAggregate(Value *From, ArrayRef<unsigned> idx_range,
Instruction *InsertBefore) {
assert(InsertBefore && "Must have someplace to insert!");
Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
idx_range);
Value *To = UndefValue::get(IndexedType);
SmallVector<unsigned, 10> Idxs(idx_range.begin(), idx_range.end());
unsigned IdxSkip = Idxs.size();
return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
}
/// Given an aggregate and a sequence of indices, see if
/// the scalar value indexed is already around as a register, for example if it
/// were inserted directly into the aggregate.
///
/// If InsertBefore is not null, this function will duplicate (modified)
/// insertvalues when a part of a nested struct is extracted.
Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
Instruction *InsertBefore) {
// Nothing to index? Just return V then (this is useful at the end of our
// recursion).
if (idx_range.empty())
return V;
// We have indices, so V should have an indexable type.
assert((V->getType()->isStructTy() || V->getType()->isArrayTy()) &&
"Not looking at a struct or array?");
assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) &&
"Invalid indices for type?");
if (Constant *C = dyn_cast<Constant>(V)) {
C = C->getAggregateElement(idx_range[0]);
if (!C) return nullptr;
return FindInsertedValue(C, idx_range.slice(1), InsertBefore);
}
if (InsertValueInst *I = dyn_cast<InsertValueInst>(V)) {
// Loop the indices for the insertvalue instruction in parallel with the
// requested indices
const unsigned *req_idx = idx_range.begin();
for (const unsigned *i = I->idx_begin(), *e = I->idx_end();
i != e; ++i, ++req_idx) {
if (req_idx == idx_range.end()) {
// We can't handle this without inserting insertvalues
if (!InsertBefore)
return nullptr;
// The requested index identifies a part of a nested aggregate. Handle
// this specially. For example,
// %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0
// %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1
// %C = extractvalue {i32, { i32, i32 } } %B, 1
// This can be changed into
// %A = insertvalue {i32, i32 } undef, i32 10, 0
// %C = insertvalue {i32, i32 } %A, i32 11, 1
// which allows the unused 0,0 element from the nested struct to be
// removed.
return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx),
InsertBefore);
}
// This insertvalue inserts something other than what we are looking for.
// See if the (aggregate) value inserted into has the value we are
// looking for, then.
if (*req_idx != *i)
return FindInsertedValue(I->getAggregateOperand(), idx_range,
InsertBefore);
}
// If we end up here, the indices of the insertvalue match with those
// requested (though possibly only partially). Now we recursively look at
// the inserted value, passing any remaining indices.
return FindInsertedValue(I->getInsertedValueOperand(),
makeArrayRef(req_idx, idx_range.end()),
InsertBefore);
}
if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) {
// If we're extracting a value from an aggregate that was extracted from
// something else, we can extract from that something else directly instead.
// However, we will need to chain I's indices with the requested indices.
// Calculate the number of indices required
unsigned size = I->getNumIndices() + idx_range.size();
// Allocate some space to put the new indices in
SmallVector<unsigned, 5> Idxs;
Idxs.reserve(size);
// Add indices from the extract value instruction
Idxs.append(I->idx_begin(), I->idx_end());
// Add requested indices
Idxs.append(idx_range.begin(), idx_range.end());
assert(Idxs.size() == size
&& "Number of indices added not correct?");
return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore);
}
// Otherwise, we don't know (such as, extracting from a function return value
// or load instruction)
return nullptr;
}
/// Analyze the specified pointer to see if it can be expressed as a base
/// pointer plus a constant offset. Return the base to the caller and the
/// offset through the out-parameter.
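/// For example (illustrative IR with a hypothetical %base):
///   %p = getelementptr inbounds i8, i8* %base, i64 42
/// yields base %base with Offset == 42.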
Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
const DataLayout &DL) {
unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType());
APInt ByteOffset(BitWidth, 0);
// We walk up the defs but use a visited set to handle unreachable code. In
// that case, we stop after accumulating the cycle once (not that it
// matters).
SmallPtrSet<Value *, 16> Visited;
while (Visited.insert(Ptr).second) {
if (Ptr->getType()->isVectorTy())
break;
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
// If one of the values we have visited is an addrspacecast, then
// the pointer type of this GEP may be different from the type
// of the Ptr parameter which was passed to this function. This
// means when we construct GEPOffset, we need to use the size
// of GEP's pointer type rather than the size of the original
// pointer type.
APInt GEPOffset(DL.getPointerTypeSizeInBits(Ptr->getType()), 0);
if (!GEP->accumulateConstantOffset(DL, GEPOffset))
break;
ByteOffset += GEPOffset.getSExtValue();
Ptr = GEP->getPointerOperand();
} else if (Operator::getOpcode(Ptr) == Instruction::BitCast ||
Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) {
Ptr = cast<Operator>(Ptr)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
if (GA->isInterposable())
break;
Ptr = GA->getAliasee();
} else {
break;
}
}
Offset = ByteOffset.getSExtValue();
return Ptr;
}
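// An accepted GEP looks like (illustrative IR, hypothetical @str and %idx,
// for CharSize == 8):
//   getelementptr [13 x i8], [13 x i8]* @str, i64 0, i64 %idx
// i.e. three operands, an array-of-i8 source element type, and a first index
// of zero.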
bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
unsigned CharSize) {
// Make sure the GEP has exactly three arguments.
if (GEP->getNumOperands() != 3)
return false;
// Make sure the index-ee is a pointer to an array of \p CharSize integers.
ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
if (!AT || !AT->getElementType()->isIntegerTy(CharSize))
return false;
// Check to make sure that the first operand of the GEP is an integer and
// has value 0 so that we are sure we're indexing into the initializer.
const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
if (!FirstIdx || !FirstIdx->isZero())
return false;
return true;
}
bool llvm::getConstantDataArrayInfo(const Value *V,
ConstantDataArraySlice &Slice,
unsigned ElementSize, uint64_t Offset) {
assert(V);
// Look through bitcast instructions and geps.
V = V->stripPointerCasts();
// If the value is a GEP instruction or constant expression, treat it as an
// offset.
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
// The GEP operator should be based on a pointer to string constant, and is
// indexing into the string constant.
if (!isGEPBasedOnPointerToString(GEP, ElementSize))
return false;
// If the second index isn't a ConstantInt, then this is a variable index
// into the array. If this occurs, we can't say anything meaningful about
// the string.
uint64_t StartIdx = 0;
if (const ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
StartIdx = CI->getZExtValue();
else
return false;
return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize,
StartIdx + Offset);
}
// The GEP operator, whether a constant expression or an instruction, must
// reference a global variable that is a constant and is initialized. The
// referenced constant initializer is the array that we'll use for
// optimization.
const GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
return false;
const ConstantDataArray *Array;
ArrayType *ArrayTy;
if (GV->getInitializer()->isNullValue()) {
Type *GVTy = GV->getValueType();
if ( (ArrayTy = dyn_cast<ArrayType>(GVTy)) ) {
// A zeroinitializer for the array; there is no ConstantDataArray.
Array = nullptr;
} else {
const DataLayout &DL = GV->getParent()->getDataLayout();
uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy);
uint64_t Length = SizeInBytes / (ElementSize / 8);
if (Length <= Offset)
return false;
Slice.Array = nullptr;
Slice.Offset = 0;
Slice.Length = Length - Offset;
return true;
}
} else {
// This must be a ConstantDataArray.
Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
if (!Array)
return false;
ArrayTy = Array->getType();
}
if (!ArrayTy->getElementType()->isIntegerTy(ElementSize))
return false;
uint64_t NumElts = ArrayTy->getArrayNumElements();
if (Offset > NumElts)
return false;
Slice.Array = Array;
Slice.Offset = Offset;
Slice.Length = NumElts - Offset;
return true;
}
/// This function extracts the bytes of the null-terminated C string pointed to
/// by V, starting at the given byte offset. If successful, it returns true and
/// returns the string in Str. If unsuccessful, it returns false.
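/// For example (illustrative, with a hypothetical global @s):
///   @s = constant [6 x i8] c"hello\00"
/// a query at Offset 0 with TrimAtNul == true yields Str == "hello".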
bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
uint64_t Offset, bool TrimAtNul) {
ConstantDataArraySlice Slice;
if (!getConstantDataArrayInfo(V, Slice, 8, Offset))
return false;
if (Slice.Array == nullptr) {
if (TrimAtNul) {
Str = StringRef();
return true;
}
if (Slice.Length == 1) {
Str = StringRef("", 1);
return true;
}
// We cannot instantiate a StringRef as we do not have an appropriate string
// of 0s at hand.
return false;
}
// Start out with the entire array in the StringRef.
Str = Slice.Array->getAsString();
// Skip over 'offset' bytes.
Str = Str.substr(Slice.Offset);
if (TrimAtNul) {
// Trim off the \0 and anything after it. If the array is not nul
// terminated, we just return the rest of the string. The client may know
// some other way that the string is length-bound.
Str = Str.substr(0, Str.find('\0'));
}
return true;
}
// These next two are very similar to the above, but also look through PHI
// nodes.
// TODO: See if we can integrate these two together.
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
static uint64_t GetStringLengthH(const Value *V,
SmallPtrSetImpl<const PHINode*> &PHIs,
unsigned CharSize) {
// Look through noop bitcast instructions.
V = V->stripPointerCasts();
// If this is a PHI node, there are two cases: either we have already seen it
// or we haven't.
if (const PHINode *PN = dyn_cast<PHINode>(V)) {
if (!PHIs.insert(PN).second)
return ~0ULL; // already in the set.
// If it was new, see if all the input strings are the same length.
uint64_t LenSoFar = ~0ULL;
for (Value *IncValue : PN->incoming_values()) {
uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize);
if (Len == 0) return 0; // Unknown length -> unknown.
if (Len == ~0ULL) continue;
if (Len != LenSoFar && LenSoFar != ~0ULL)
return 0; // Disagree -> unknown.
LenSoFar = Len;
}
// Success, all agree.
return LenSoFar;
}
// strlen(select(c,x,y)) -> strlen(x) if strlen(x) == strlen(y); otherwise unknown.
if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize);
if (Len1 == 0) return 0;
uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize);
if (Len2 == 0) return 0;
if (Len1 == ~0ULL) return Len2;
if (Len2 == ~0ULL) return Len1;
if (Len1 != Len2) return 0;
return Len1;
}
// Otherwise, see if we can read the string.
ConstantDataArraySlice Slice;
if (!getConstantDataArrayInfo(V, Slice, CharSize))
return 0;
if (Slice.Array == nullptr)
return 1;
// Search for nul characters
unsigned NullIndex = 0;
for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) {
if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0)
break;
}
return NullIndex + 1;
}
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
if (!V->getType()->isPointerTy()) return 0;
SmallPtrSet<const PHINode*, 32> PHIs;
uint64_t Len = GetStringLengthH(V, PHIs, CharSize);
// If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
// the length of an empty string (1, counting the nul terminator).
return Len == ~0ULL ? 1 : Len;
}
/// \brief \p PN defines a loop-variant pointer to an object. Check if the
/// previous iteration of the loop was referring to the same object as \p PN.
static bool isSameUnderlyingObjectInLoop(const PHINode *PN,
const LoopInfo *LI) {
// Find the loop-defined value.
Loop *L = LI->getLoopFor(PN->getParent());
if (PN->getNumIncomingValues() != 2)
return true;
// Find the value from previous iteration.
auto *PrevValue = dyn_cast<Instruction>(PN->getIncomingValue(0));
if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L)
PrevValue = dyn_cast<Instruction>(PN->getIncomingValue(1));
if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L)
return true;
// If a new pointer is loaded in the loop, the pointer references a different
// object in every iteration. E.g.:
// for (i)
// int *p = a[i];
// ...
if (auto *Load = dyn_cast<LoadInst>(PrevValue))
if (!L->isLoopInvariant(Load->getPointerOperand()))
return false;
return true;
}
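// GetUnderlyingObject strips GEPs, casts, and non-interposable aliases to find
// the object a pointer is based on. For example (illustrative IR with
// hypothetical values):
//   %a = alloca [4 x i32]
//   %p = getelementptr [4 x i32], [4 x i32]* %a, i64 0, i64 2
//   %q = bitcast i32* %p to i8*
// GetUnderlyingObject(%q, DL) returns %a.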
Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
unsigned MaxLookup) {
if (!V->getType()->isPointerTy())
return V;
for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (GA->isInterposable())
return V;
V = GA->getAliasee();
} else if (isa<AllocaInst>(V)) {
// An alloca can't be further simplified.
return V;
} else {
if (auto CS = CallSite(V))
if (Value *RV = CS.getReturnedArgOperand()) {
V = RV;
continue;
}
// See if InstructionSimplify knows any relevant tricks.
if (Instruction *I = dyn_cast<Instruction>(V))
// TODO: Acquire a DominatorTree and AssumptionCache and use them.
if (Value *Simplified = SimplifyInstruction(I, {DL, I})) {
V = Simplified;
continue;
}
return V;
}
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
}
return V;
}
void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl<Value *> &Objects,
const DataLayout &DL, LoopInfo *LI,
unsigned MaxLookup) {
SmallPtrSet<Value *, 4> Visited;
SmallVector<Value *, 4> Worklist;
Worklist.push_back(V);
do {
Value *P = Worklist.pop_back_val();
P = GetUnderlyingObject(P, DL, MaxLookup);
if (!Visited.insert(P).second)
continue;
if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
Worklist.push_back(SI->getTrueValue());
Worklist.push_back(SI->getFalseValue());
continue;
}
if (PHINode *PN = dyn_cast<PHINode>(P)) {
// If this PHI changes the underlying object in every iteration of the
// loop, don't look through it. Consider:
// int **A;
// for (i) {
// Prev = Curr; // Prev = PHI (Prev_0, Curr)
// Curr = A[i];
// *Prev, *Curr;
//
// Prev is tracking Curr one iteration behind so they refer to different
// underlying objects.
if (!LI || !LI->isLoopHeader(PN->getParent()) ||
isSameUnderlyingObjectInLoop(PN, LI))
for (Value *IncValue : PN->incoming_values())
Worklist.push_back(IncValue);
continue;
}
Objects.push_back(P);
} while (!Worklist.empty());
}
/// This is the function that does the work of looking through basic
/// ptrtoint+arithmetic+inttoptr sequences.
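/// For example (illustrative IR with hypothetical values):
///   %i = ptrtoint i8* %obj to i64
///   %j = add i64 %i, 16
///   %p = inttoptr i64 %j to i8*
/// starting from %j, the walk steps through the add to the ptrtoint and
/// returns %obj.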
static const Value *getUnderlyingObjectFromInt(const Value *V) {
do {
if (const Operator *U = dyn_cast<Operator>(V)) {
// If we find a ptrtoint, we can transfer control back to the
// regular getUnderlyingObjectFromInt.
if (U->getOpcode() == Instruction::PtrToInt)
return U->getOperand(0);
// If we find an add of a constant, a multiplied value, or a phi, it's
// likely that the other operand will lead us to the base
// object. We don't have to worry about the case where the
// object address is somehow being computed by the multiply,
// because our callers only care when the result is an
// identifiable object.
if (U->getOpcode() != Instruction::Add ||
(!isa<ConstantInt>(U->getOperand(1)) &&
Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
!isa<PHINode>(U->getOperand(1))))
return V;
V = U->getOperand(0);
} else {
return V;
}
assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
} while (true);
}
/// This is a wrapper around GetUnderlyingObjects and adds support for basic
/// ptrtoint+arithmetic+inttoptr sequences.
void llvm::getUnderlyingObjectsForCodeGen(const Value *V,
SmallVectorImpl<Value *> &Objects,
const DataLayout &DL) {
SmallPtrSet<const Value *, 16> Visited;
SmallVector<const Value *, 4> Working(1, V);
do {
V = Working.pop_back_val();
SmallVector<Value *, 4> Objs;
GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
for (Value *V : Objs) {
if (!Visited.insert(V).second)
continue;
if (Operator::getOpcode(V) == Instruction::IntToPtr) {
const Value *O =
getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
if (O->getType()->isPointerTy()) {
Working.push_back(O);
continue;
}
}
// If GetUnderlyingObjects fails to find an identifiable object,
// getUnderlyingObjectsForCodeGen also fails for safety.
if (!isIdentifiedObject(V)) {
Objects.clear();
return;
}
Objects.push_back(const_cast<Value *>(V));
}
} while (!Working.empty());
}
/// Return true if the only users of this pointer are lifetime markers.
bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
for (const User *U : V->users()) {
const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
if (!II) return false;
if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
II->getIntrinsicID() != Intrinsic::lifetime_end)
return false;
}
return true;
}
bool llvm::isSafeToSpeculativelyExecute(const Value *V,
const Instruction *CtxI,
const DominatorTree *DT) {
const Operator *Inst = dyn_cast<Operator>(V);
if (!Inst)
return false;
for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i)))
if (C->canTrap())
return false;
switch (Inst->getOpcode()) {
default:
return true;
case Instruction::UDiv:
case Instruction::URem: {
// x / y is undefined if y == 0.
const APInt *V;
if (match(Inst->getOperand(1), m_APInt(V)))
return *V != 0;
return false;
}
case Instruction::SDiv:
case Instruction::SRem: {
// x / y is undefined if y == 0, or if x == INT_MIN and y == -1.
const APInt *Numerator, *Denominator;
if (!match(Inst->getOperand(1), m_APInt(Denominator)))
return false;
// We cannot hoist this division if the denominator is 0.
if (*Denominator == 0)
return false;
// It's safe to hoist if the denominator is neither 0 nor -1.
if (*Denominator != -1)
return true;
// At this point we know that the denominator is -1. It is safe to hoist as
// long we know that the numerator is not INT_MIN.
if (match(Inst->getOperand(0), m_APInt(Numerator)))
return !Numerator->isMinSignedValue();
// The numerator *might* be MinSignedValue.
return false;
}
case Instruction::Load: {
const LoadInst *LI = cast<LoadInst>(Inst);
if (!LI->isUnordered() ||
// Speculative load may create a race that did not exist in the source.
LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
// Speculative load may load data from dirty regions.
LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
return false;
const DataLayout &DL = LI->getModule()->getDataLayout();
return isDereferenceableAndAlignedPointer(LI->getPointerOperand(),
LI->getAlignment(), DL, CtxI, DT);
}
case Instruction::Call: {
auto *CI = cast<const CallInst>(Inst);
const Function *Callee = CI->getCalledFunction();
// The called function could have undefined behavior or side-effects, even
// if marked readnone nounwind.
return Callee && Callee->isSpeculatable();
}
case Instruction::VAArg:
case Instruction::Alloca:
case Instruction::Invoke:
case Instruction::PHI:
case Instruction::Store:
case Instruction::Ret:
case Instruction::Br:
case Instruction::IndirectBr:
case Instruction::Switch:
case Instruction::Unreachable:
case Instruction::Fence:
case Instruction::AtomicRMW:
case Instruction::AtomicCmpXchg:
case Instruction::LandingPad:
case Instruction::Resume:
case Instruction::CatchSwitch:
case Instruction::CatchPad:
case Instruction::CatchRet:
case Instruction::CleanupPad:
case Instruction::CleanupRet:
return false; // Misc instructions which have effects
}
}
bool llvm::mayBeMemoryDependent(const Instruction &I) {
return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I);
}
/// Return true if we know that the specified value is never null.
bool llvm::isKnownNonNull(const Value *V) {
assert(V->getType()->isPointerTy() && "V must be pointer type");
// Alloca never returns null, malloc might.
if (isa<AllocaInst>(V)) return true;
// A byval, inalloca, or nonnull argument is never null.
if (const Argument *A = dyn_cast<Argument>(V))
return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr();
// A global variable in address space 0 is non-null unless it is extern weak
// or an absolute symbol reference. Other address spaces may have null as a
// valid address for a global, so we can't assume anything.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
return !GV->isAbsoluteSymbolRef() && !GV->hasExternalWeakLinkage() &&
GV->getType()->getAddressSpace() == 0;
// A Load tagged with nonnull metadata is never null.
if (const LoadInst *LI = dyn_cast<LoadInst>(V))
return LI->getMetadata(LLVMContext::MD_nonnull);
if (auto CS = ImmutableCallSite(V))
if (CS.isReturnNonNull())
return true;
return false;
}
static bool isKnownNonNullFromDominatingCondition(const Value *V,
const Instruction *CtxI,
const DominatorTree *DT) {
assert(V->getType()->isPointerTy() && "V must be pointer type");
assert(!isa<ConstantData>(V) && "Did not expect ConstantPointerNull");
assert(CtxI && "Context instruction required for analysis");
assert(DT && "Dominator tree required for analysis");
unsigned NumUsesExplored = 0;
for (auto *U : V->users()) {
// Avoid massive lists
if (NumUsesExplored >= DomConditionsMaxUses)
break;
NumUsesExplored++;
// If the value is used as an argument to a call or invoke, then argument
// attributes may provide an answer about null-ness.
if (auto CS = ImmutableCallSite(U))
if (auto *CalledFunc = CS.getCalledFunction())
for (const Argument &Arg : CalledFunc->args())
if (CS.getArgOperand(Arg.getArgNo()) == V &&
Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
return true;
// Consider only compare instructions uniquely controlling a branch
CmpInst::Predicate Pred;
if (!match(const_cast<User *>(U),
m_c_ICmp(Pred, m_Specific(V), m_Zero())) ||
(Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE))
continue;
for (auto *CmpU : U->users()) {
if (const BranchInst *BI = dyn_cast<BranchInst>(CmpU)) {
assert(BI->isConditional() && "uses a comparison!");
BasicBlock *NonNullSuccessor =
BI->getSuccessor(Pred == ICmpInst::ICMP_EQ ? 1 : 0);
BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor);
if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent()))
return true;
} else if (Pred == ICmpInst::ICMP_NE &&
match(CmpU, m_Intrinsic<Intrinsic::experimental_guard>()) &&
DT->dominates(cast<Instruction>(CmpU), CtxI)) {
return true;
}
}
}
return false;
}
bool llvm::isKnownNonNullAt(const Value *V, const Instruction *CtxI,
const DominatorTree *DT) {
if (isa<ConstantPointerNull>(V) || isa<UndefValue>(V))
return false;
if (isKnownNonNull(V))
return true;
if (!CtxI || !DT)
return false;
return ::isKnownNonNullFromDominatingCondition(V, CtxI, DT);
}
OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
// This means if we have enough leading zero bits in the operands
// we can guarantee that the result does not overflow.
// Ref: "Hacker's Delight" by Henry Warren
unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
KnownBits LHSKnown(BitWidth);
KnownBits RHSKnown(BitWidth);
computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
// Note that underestimating the number of zero bits gives a more
// conservative answer.
unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
RHSKnown.countMinLeadingZeros();
// First handle the easy case: if we have enough zero bits there's
// definitely no overflow.
if (ZeroBits >= BitWidth)
return OverflowResult::NeverOverflows;
// Get the largest possible values for each operand.
APInt LHSMax = ~LHSKnown.Zero;
APInt RHSMax = ~RHSKnown.Zero;
// We know the multiply operation doesn't overflow if the maximum values for
// each operand will not overflow after we multiply them together.
bool MaxOverflow;
(void)LHSMax.umul_ov(RHSMax, MaxOverflow);
if (!MaxOverflow)
return OverflowResult::NeverOverflows;
// We know it always overflows if multiplying the smallest possible values for
// the operands also results in overflow.
bool MinOverflow;
(void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow);
if (MinOverflow)
return OverflowResult::AlwaysOverflows;
return OverflowResult::MayOverflow;
}
OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
// The sign bit is set in both cases: this MUST overflow.
return OverflowResult::AlwaysOverflows;
}
if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
// The sign bit is clear in both cases: this CANNOT overflow.
return OverflowResult::NeverOverflows;
}
}
return OverflowResult::MayOverflow;
}
/// \brief Return true if we can prove that adding two values with the given
/// known bits cannot produce signed overflow.
/// Otherwise return false.
static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
const KnownBits &RHSKnown) {
// Addition of two 2's complement numbers having opposite signs will never
// overflow.
if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
(LHSKnown.isNonNegative() && RHSKnown.isNegative()))
return true;
// If either of the values is known to be non-negative, adding them can only
// overflow if the second is also non-negative, so we can assume that.
// Two non-negative numbers will only overflow if there is a carry into the
// sign bit, so we check whether, even when the values are as large as
// possible, there is no carry into the sign bit.
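// For example (illustrative, 8-bit): if both operands are known to be at most
// 0x3f (two known-zero high bits), MaxLHS + MaxRHS <= 0x7e leaves the sign bit
// clear, so the addition cannot overflow.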
if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
APInt MaxLHS = ~LHSKnown.Zero;
MaxLHS.clearSignBit();
APInt MaxRHS = ~RHSKnown.Zero;
MaxRHS.clearSignBit();
APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
return Result.isSignBitClear();
}
// If either of the values is known to be negative, adding them can only
// overflow if the second is also negative, so we can assume that.
// Two negative numbers will only overflow if there is no carry into the sign
// bit, so we check whether, even when the values are as small as possible,
// there is a carry into the sign bit.
if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
APInt MinLHS = LHSKnown.One;
MinLHS.clearSignBit();
APInt MinRHS = RHSKnown.One;
MinRHS.clearSignBit();
APInt Result = std::move(MinLHS) + std::move(MinRHS);
return Result.isSignBitSet();
}
// If we reached here it means that we know nothing about the sign bits.
// In this case we can't know if there will be an overflow, since by
// changing the sign bits any two values can be made to overflow.
return false;
}
static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const AddOperator *Add,
const DataLayout &DL,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
if (Add && Add->hasNoSignedWrap()) {
return OverflowResult::NeverOverflows;
}
// If LHS and RHS each have at least two sign bits, the addition will look
// like
//
// XX..... +
// YY.....
//
// If the carry into the most significant position is 0, X and Y can't both
// be 1 and therefore the carry out of the addition is also 0.
//
// If the carry into the most significant position is 1, X and Y can't both
// be 0 and therefore the carry out of the addition is also 1.
//
// Since the carry into the most significant position is always equal to
// the carry out of the addition, there is no signed overflow.
if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
return OverflowResult::NeverOverflows;
KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
return OverflowResult::NeverOverflows;
// The remaining code needs Add to be available. Return early if it is not.
if (!Add)
return OverflowResult::MayOverflow;
// If the sign of Add is the same as at least one of the operands, this add
// CANNOT overflow. This is particularly useful when the sum is
// @llvm.assume'ed non-negative rather than proved so from analyzing its
// operands.
bool LHSOrRHSKnownNonNegative =
(LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
bool LHSOrRHSKnownNegative =
(LHSKnown.isNegative() || RHSKnown.isNegative());
if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
(AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
return OverflowResult::NeverOverflows;
}
}
return OverflowResult::MayOverflow;
}
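// isOverflowIntrinsicNoWrap (below) recognizes the guarded pattern, e.g.
// (illustrative IR with hypothetical values):
//   %s  = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %v  = extractvalue { i32, i1 } %s, 0
//   %ov = extractvalue { i32, i1 } %s, 1
//   br i1 %ov, label %trap, label %cont
// and reports no-wrap when every use of %v is dominated by the edge to %cont
// (the successor taken when %ov is false).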
bool llvm::isOverflowIntrinsicNoWrap(const IntrinsicInst *II,
const DominatorTree &DT) {
#ifndef NDEBUG
auto IID = II->getIntrinsicID();
assert((IID == Intrinsic::sadd_with_overflow ||
IID == Intrinsic::uadd_with_overflow ||
IID == Intrinsic::ssub_with_overflow ||
IID == Intrinsic::usub_with_overflow ||
IID == Intrinsic::smul_with_overflow ||
IID == Intrinsic::umul_with_overflow) &&
"Not an overflow intrinsic!");
#endif
SmallVector<const BranchInst *, 2> GuardingBranches;
SmallVector<const ExtractValueInst *, 2> Results;
for (const User *U : II->users()) {
if (const auto *EVI = dyn_cast<ExtractValueInst>(U)) {
assert(EVI->getNumIndices() == 1 && "Obvious from CI's type");
if (EVI->getIndices()[0] == 0)
Results.push_back(EVI);
else {
assert(EVI->getIndices()[0] == 1 && "Obvious from CI's type");
for (const auto *U : EVI->users())
if (const auto *B = dyn_cast<BranchInst>(U)) {
assert(B->isConditional() && "How else is it using an i1?");
GuardingBranches.push_back(B);
}
}
} else {
// We are using the aggregate directly in a way we don't want to analyze
// here (storing it to a global, say).
return false;
}
}
auto AllUsesGuardedByBranch = [&](const BranchInst *BI) {
BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1));
if (!NoWrapEdge.isSingleEdge())
return false;
// Check if all users of the add are provably no-wrap.
for (const auto *Result : Results) {
// If the extractvalue itself is not executed on overflow, then we don't
// need to check each use separately, since domination is transitive.
if (DT.dominates(NoWrapEdge, Result->getParent()))
continue;
for (auto &RU : Result->uses())
if (!DT.dominates(NoWrapEdge, RU))
return false;
}
return true;
};
return any_of(GuardingBranches, AllUsesGuardedByBranch);
}
OverflowResult llvm::computeOverflowForSignedAdd(const AddOperator *Add,
const DataLayout &DL,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1),
Add, DL, AC, CxtI, DT);
}
OverflowResult llvm::computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const DataLayout &DL,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT);
}
bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
// A memory operation returns normally if it isn't volatile. A volatile
// operation is allowed to trap.
//
// An atomic operation isn't guaranteed to return in a reasonable amount of
// time because it's possible for another thread to interfere with it for an
// arbitrary length of time, but programs aren't allowed to rely on that.
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
return !LI->isVolatile();
if (const StoreInst *SI = dyn_cast<StoreInst>(I))
return !SI->isVolatile();
if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
return !CXI->isVolatile();
if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
return !RMWI->isVolatile();
if (const MemIntrinsic *MII = dyn_cast<MemIntrinsic>(I))
return !MII->isVolatile();
// If there is no successor, then execution can't transfer to it.
if (const auto *CRI = dyn_cast<CleanupReturnInst>(I))
return !CRI->unwindsToCaller();
if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I))
return !CatchSwitch->unwindsToCaller();
if (isa<ResumeInst>(I))
return false;
if (isa<ReturnInst>(I))
return false;
if (isa<UnreachableInst>(I))
return false;
// Calls can throw, or contain an infinite loop, or kill the process.
if (auto CS = ImmutableCallSite(I)) {
// Call sites that throw have implicit non-local control flow.
if (!CS.doesNotThrow())
return false;
// Non-throwing call sites can loop infinitely, call exit/pthread_exit
// etc. and thus not return. However, LLVM already assumes that
//
// - Thread exiting actions are modeled as writes to memory invisible to
// the program.
//
// - Loops that don't have side effects (side effects are volatile/atomic
// stores and IO) always terminate (see http://llvm.org/PR965).
// Furthermore IO itself is also modeled as writes to memory invisible to
// the program.
//
// We rely on those assumptions here, and use the memory effects of the call
// target as a proxy for checking that it always returns.
// FIXME: This isn't aggressive enough; a call which only writes to a global
// is guaranteed to return.
return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() ||
match(I, m_Intrinsic<Intrinsic::assume>());
}
// Other instructions return normally.
return true;
}
bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
const Loop *L) {
// The loop header is guaranteed to be executed for every iteration.
//
// FIXME: Relax this constraint to cover all basic blocks that are
// guaranteed to be executed at every iteration.
if (I->getParent() != L->getHeader()) return false;
for (const Instruction &LI : *L->getHeader()) {
if (&LI == I) return true;
if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false;
}
llvm_unreachable("Instruction not contained in its own parent basic block.");
}
bool llvm::propagatesFullPoison(const Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Xor:
case Instruction::Trunc:
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
case Instruction::Mul:
case Instruction::Shl:
case Instruction::GetElementPtr:
// These operations all propagate poison unconditionally. Note that poison
// is not any particular value, so xor or subtraction of poison with
// itself still yields poison, not zero.
return true;
case Instruction::AShr:
case Instruction::SExt:
// For these operations, one bit of the input is replicated across
// multiple output bits. A replicated poison bit is still poison.
return true;
case Instruction::ICmp:
// Comparing poison with any value yields poison. This is why, for
// instance, x s< (x +nsw 1) can be folded to true.
return true;
default:
return false;
}
}
const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Store:
return cast<StoreInst>(I)->getPointerOperand();
case Instruction::Load:
return cast<LoadInst>(I)->getPointerOperand();
case Instruction::AtomicCmpXchg:
return cast<AtomicCmpXchgInst>(I)->getPointerOperand();
case Instruction::AtomicRMW:
return cast<AtomicRMWInst>(I)->getPointerOperand();
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
return I->getOperand(1);
default:
return nullptr;
}
}
bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
// We currently only look for uses of poison values within the same basic
// block, as that makes it easier to guarantee that the uses will be
// executed given that PoisonI is executed.
//
// FIXME: Expand this to consider uses beyond the same basic block. To do
// this, look out for the distinction between post-dominance and strong
// post-dominance.
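// For example (illustrative IR, hypothetical values), if the instruction
// defining %p yields full poison in:
//   %q = getelementptr i8, i8* %p, i64 1
//   store i8 0, i8* %q
// then %q is poison too (GEPs propagate poison) and the store's pointer
// operand must not be poison, so the program is undefined if %p is poison and
// both instructions execute.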
const BasicBlock *BB = PoisonI->getParent();
// Set of instructions that we have proved will yield poison if PoisonI
// does.
SmallSet<const Value *, 16> YieldsPoison;
SmallSet<const BasicBlock *, 4> Visited;
YieldsPoison.insert(PoisonI);
Visited.insert(PoisonI->getParent());
BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end();
unsigned Iter = 0;
while (Iter++ < MaxDepth) {
for (auto &I : make_range(Begin, End)) {
if (&I != PoisonI) {
const Value *NotPoison = getGuaranteedNonFullPoisonOp(&I);
if (NotPoison != nullptr && YieldsPoison.count(NotPoison))
return true;
if (!isGuaranteedToTransferExecutionToSuccessor(&I))
return false;
}
// Mark poison that propagates from I through uses of I.
if (YieldsPoison.count(&I)) {
for (const User *User : I.users()) {
const Instruction *UserI = cast<Instruction>(User);
if (propagatesFullPoison(UserI))
YieldsPoison.insert(User);
}
}
}
if (auto *NextBB = BB->getSingleSuccessor()) {
if (Visited.insert(NextBB).second) {
BB = NextBB;
Begin = BB->getFirstNonPHI()->getIterator();
End = BB->end();
continue;
}
}
break;
};
return false;
}
static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) {
if (FMF.noNaNs())
return true;
if (auto *C = dyn_cast<ConstantFP>(V))
return !C->isNaN();
return false;
}
static bool isKnownNonZero(const Value *V) {
if (auto *C = dyn_cast<ConstantFP>(V))
return !C->isZero();
return false;
}
/// Match non-obvious integer minimum and maximum sequences.
static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
Value *CmpLHS, Value *CmpRHS,
Value *TrueVal, Value *FalseVal,
Value *&LHS, Value *&RHS) {
// Assume success. If there's no match, callers should not use these anyway.
LHS = TrueVal;
RHS = FalseVal;
// Recognize variations of:
// CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v)))
const APInt *C1;
if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) {
const APInt *C2;
// (X <s C1) ? C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1)
if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) &&
C1->slt(*C2) && Pred == CmpInst::ICMP_SLT)
return {SPF_SMAX, SPNB_NA, false};
// (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1)
if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) &&
C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT)
return {SPF_SMIN, SPNB_NA, false};
// (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1)
if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) &&
C1->ult(*C2) && Pred == CmpInst::ICMP_ULT)
return {SPF_UMAX, SPNB_NA, false};
// (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1)
if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) &&
C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT)
return {SPF_UMIN, SPNB_NA, false};
}
if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
return {SPF_UNKNOWN, SPNB_NA, false};
// Z = X -nsw Y
// (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
// (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
if (match(TrueVal, m_Zero()) &&
match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
// Z = X -nsw Y
// (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
// (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
if (match(FalseVal, m_Zero()) &&
match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
if (!match(CmpRHS, m_APInt(C1)))
return {SPF_UNKNOWN, SPNB_NA, false};
// An unsigned min/max can be written with a signed compare.
const APInt *C2;
if ((CmpLHS == TrueVal && match(FalseVal, m_APInt(C2))) ||
(CmpLHS == FalseVal && match(TrueVal, m_APInt(C2)))) {
// Is the sign bit set?
// (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
// (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
// Is the sign bit clear?
// (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
// (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
C2->isMinSignedValue())
return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
}
// Look through 'not' ops to find disguised signed min/max.
// (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
// (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
// (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
// (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
return {SPF_UNKNOWN, SPNB_NA, false};
}
static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
FastMathFlags FMF,
Value *CmpLHS, Value *CmpRHS,
Value *TrueVal, Value *FalseVal,
Value *&LHS, Value *&RHS) {
LHS = CmpLHS;
RHS = CmpRHS;
// If the predicate is an "or-equal" (FP) predicate, then comparisons
// involving signed zeroes may return inconsistent results between
// implementations.
// (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0
// minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1)
// Therefore we behave conservatively and only proceed if at least one of the
// operands is known to not be zero, or if we don't care about signed zeroes.
switch (Pred) {
default: break;
case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE:
case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE:
if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) &&
!isKnownNonZero(CmpRHS))
return {SPF_UNKNOWN, SPNB_NA, false};
}
SelectPatternNaNBehavior NaNBehavior = SPNB_NA;
bool Ordered = false;
// When given one NaN and one non-NaN input:
// - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input.
// - A simple C99 (a < b ? a : b) construction will return 'b' (as the
// ordered comparison fails), which could be NaN or non-NaN.
// so here we discover exactly what NaN behavior is required/accepted.
if (CmpInst::isFPPredicate(Pred)) {
bool LHSSafe = isKnownNonNaN(CmpLHS, FMF);
bool RHSSafe = isKnownNonNaN(CmpRHS, FMF);
if (LHSSafe && RHSSafe) {
// Both operands are known non-NaN.
NaNBehavior = SPNB_RETURNS_ANY;
} else if (CmpInst::isOrdered(Pred)) {
// An ordered comparison will return false when given a NaN, so it
// returns the RHS.
Ordered = true;
if (LHSSafe)
// LHS is non-NaN, so if RHS is NaN then NaN will be returned.
NaNBehavior = SPNB_RETURNS_NAN;
else if (RHSSafe)
NaNBehavior = SPNB_RETURNS_OTHER;
else
// Completely unsafe.
return {SPF_UNKNOWN, SPNB_NA, false};
} else {
Ordered = false;
// An unordered comparison will return true when given a NaN, so it
// returns the LHS.
if (LHSSafe)
// LHS is non-NaN, so if RHS is NaN then non-NaN will be returned.
NaNBehavior = SPNB_RETURNS_OTHER;
else if (RHSSafe)
NaNBehavior = SPNB_RETURNS_NAN;
else
// Completely unsafe.
return {SPF_UNKNOWN, SPNB_NA, false};
}
}
if (TrueVal == CmpRHS && FalseVal == CmpLHS) {
std::swap(CmpLHS, CmpRHS);
Pred = CmpInst::getSwappedPredicate(Pred);
if (NaNBehavior == SPNB_RETURNS_NAN)
NaNBehavior = SPNB_RETURNS_OTHER;
else if (NaNBehavior == SPNB_RETURNS_OTHER)
NaNBehavior = SPNB_RETURNS_NAN;
Ordered = !Ordered;
}
// ([if]cmp X, Y) ? X : Y
if (TrueVal == CmpLHS && FalseVal == CmpRHS) {
switch (Pred) {
default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality.
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false};
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false};
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false};
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false};
case FCmpInst::FCMP_UGT:
case FCmpInst::FCMP_UGE:
case FCmpInst::FCMP_OGT:
case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered};
case FCmpInst::FCMP_ULT:
case FCmpInst::FCMP_ULE:
case FCmpInst::FCMP_OLT:
case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered};
}
}
const APInt *C1;
if (match(CmpRHS, m_APInt(C1))) {
if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) ||
(CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) {
// ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
// NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
if (Pred == ICmpInst::ICMP_SGT && (*C1 == 0 || C1->isAllOnesValue())) {
return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
}
// ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
// NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
if (Pred == ICmpInst::ICMP_SLT && (*C1 == 0 || *C1 == 1)) {
return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
}
}
}
return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS);
}
static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
Instruction::CastOps *CastOp) {
auto *Cast1 = dyn_cast<CastInst>(V1);
if (!Cast1)
return nullptr;
*CastOp = Cast1->getOpcode();
Type *SrcTy = Cast1->getSrcTy();
if (auto *Cast2 = dyn_cast<CastInst>(V2)) {
// If V1 and V2 are both the same cast from the same type, look through V1.
if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy())
return Cast2->getOperand(0);
return nullptr;
}
auto *C = dyn_cast<Constant>(V2);
if (!C)
return nullptr;
Constant *CastedTo = nullptr;
switch (*CastOp) {
case Instruction::ZExt:
if (CmpI->isUnsigned())
CastedTo = ConstantExpr::getTrunc(C, SrcTy);
break;
case Instruction::SExt:
if (CmpI->isSigned())
CastedTo = ConstantExpr::getTrunc(C, SrcTy, true);
break;
case Instruction::Trunc:
CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned());
break;
case Instruction::FPTrunc:
CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true);
break;
case Instruction::FPExt:
CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true);
break;
case Instruction::FPToUI:
CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true);
break;
case Instruction::FPToSI:
CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true);
break;
case Instruction::UIToFP:
CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true);
break;
case Instruction::SIToFP:
CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true);
break;
default:
break;
}
if (!CastedTo)
return nullptr;
// Make sure the cast doesn't lose any information.
Constant *CastedBack =
ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true);
if (CastedBack != C)
return nullptr;
return CastedTo;
}
SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
Instruction::CastOps *CastOp) {
SelectInst *SI = dyn_cast<SelectInst>(V);
if (!SI) return {SPF_UNKNOWN, SPNB_NA, false};
CmpInst *CmpI = dyn_cast<CmpInst>(SI->getCondition());
if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false};
CmpInst::Predicate Pred = CmpI->getPredicate();
Value *CmpLHS = CmpI->getOperand(0);
Value *CmpRHS = CmpI->getOperand(1);
Value *TrueVal = SI->getTrueValue();
Value *FalseVal = SI->getFalseValue();
FastMathFlags FMF;
if (isa<FPMathOperator>(CmpI))
FMF = CmpI->getFastMathFlags();
// Bail out early.
if (CmpI->isEquality())
return {SPF_UNKNOWN, SPNB_NA, false};
// Deal with type mismatches.
if (CastOp && CmpLHS->getType() != TrueVal->getType()) {
if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp))
return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
cast<CastInst>(TrueVal)->getOperand(0), C,
LHS, RHS);
if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp))
return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
C, cast<CastInst>(FalseVal)->getOperand(0),
LHS, RHS);
}
return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal,
LHS, RHS);
}
/// Return true if "icmp Pred LHS RHS" is always true.
static bool isTruePredicate(CmpInst::Predicate Pred,
const Value *LHS, const Value *RHS,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!");
if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
return true;
switch (Pred) {
default:
return false;
case CmpInst::ICMP_SLE: {
const APInt *C;
// LHS s<= LHS +_{nsw} C if C >= 0
if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))))
return !C->isNegative();
return false;
}
case CmpInst::ICMP_ULE: {
const APInt *C;
// LHS u<= LHS +_{nuw} C for any C
if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C))))
return true;
// Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B,
const Value *&X,
const APInt *&CA, const APInt *&CB) {
if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) &&
match(B, m_NUWAdd(m_Specific(X), m_APInt(CB))))
return true;
// If X & C == 0 then (X | C) == X +_{nuw} C
if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
KnownBits Known(CA->getBitWidth());
computeKnownBits(X, Known, DL, Depth + 1, AC, CxtI, DT);
if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
return true;
}
return false;
};
const Value *X;
const APInt *CLHS, *CRHS;
if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS))
return CLHS->ule(*CRHS);
return false;
}
}
}
/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred
/// ALHS ARHS" is true. Otherwise, return None.
static Optional<bool>
isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
const Value *ARHS, const Value *BLHS,
const Value *BRHS, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI, const DominatorTree *DT) {
switch (Pred) {
default:
return None;
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI,
DT) &&
isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
return true;
return None;
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI,
DT) &&
isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
return true;
return None;
}
}
/// Return true if the operands of the two compares match. IsSwappedOps is true
/// when the operands match, but are swapped.
static bool isMatchingOps(const Value *ALHS, const Value *ARHS,
const Value *BLHS, const Value *BRHS,
bool &IsSwappedOps) {
bool IsMatchingOps = (ALHS == BLHS && ARHS == BRHS);
IsSwappedOps = (ALHS == BRHS && ARHS == BLHS);
return IsMatchingOps || IsSwappedOps;
}
/// Return true if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS BRHS" is
/// true. Return false if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS
/// BRHS" is false. Otherwise, return None if we can't infer anything.
static Optional<bool> isImpliedCondMatchingOperands(CmpInst::Predicate APred,
const Value *ALHS,
const Value *ARHS,
CmpInst::Predicate BPred,
const Value *BLHS,
const Value *BRHS,
bool IsSwappedOps) {
// Canonicalize the operands so they're matching.
if (IsSwappedOps) {
std::swap(BLHS, BRHS);
BPred = ICmpInst::getSwappedPredicate(BPred);
}
if (CmpInst::isImpliedTrueByMatchingCmp(APred, BPred))
return true;
if (CmpInst::isImpliedFalseByMatchingCmp(APred, BPred))
return false;
return None;
}
/// Return true if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS C2" is
/// true. Return false if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS
/// C2" is false. Otherwise, return None if we can't infer anything.
static Optional<bool>
isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, const Value *ALHS,
const ConstantInt *C1,
CmpInst::Predicate BPred,
const Value *BLHS, const ConstantInt *C2) {
assert(ALHS == BLHS && "LHS operands must match.");
ConstantRange DomCR =
ConstantRange::makeExactICmpRegion(APred, C1->getValue());
ConstantRange CR =
ConstantRange::makeAllowedICmpRegion(BPred, C2->getValue());
ConstantRange Intersection = DomCR.intersectWith(CR);
ConstantRange Difference = DomCR.difference(CR);
if (Intersection.isEmptySet())
return false;
if (Difference.isEmptySet())
return true;
return None;
}
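// Worked example (illustrative): for APred = ult with C1 = 8 and BPred = ult
// with C2 = 16, DomCR is exactly [0, 8) and CR is [0, 16); the difference is
// empty, so "x u< 8" implies "x u< 16" and we return true. With C1 and C2
// swapped, both the intersection and the difference are non-empty, so we
// return None.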
Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
const DataLayout &DL, bool LHSIsFalse,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
+ // Bail out when we hit the limit.
+ if (Depth == MaxDepth)
+ return None;
+
// A mismatch occurs when we compare a scalar cmp to a vector cmp, for example.
if (LHS->getType() != RHS->getType())
return None;
Type *OpTy = LHS->getType();
assert(OpTy->isIntOrIntVectorTy(1));
// LHS ==> RHS by definition
if (LHS == RHS)
return !LHSIsFalse;
if (OpTy->isVectorTy())
// TODO: extending the code below to handle vectors
return None;
assert(OpTy->isIntegerTy(1) && "implied by above");
Value *BLHS, *BRHS;
ICmpInst::Predicate BPred;
// We expect the RHS to be an icmp.
if (!match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS))))
return None;
Value *ALHS, *ARHS;
ICmpInst::Predicate APred;
// The LHS can be an 'or', 'and', or 'icmp'.
if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS)))) {
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth == MaxDepth)
return None;
// If the result of an 'or' is false, then we know both legs of the 'or' are
// false. Similarly, if the result of an 'and' is true, then we know both
// legs of the 'and' are true.
if ((LHSIsFalse && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) ||
(!LHSIsFalse && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) {
if (Optional<bool> Implication = isImpliedCondition(
ALHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
return Implication;
if (Optional<bool> Implication = isImpliedCondition(
ARHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
return Implication;
return None;
}
return None;
}
// All of the below logic assumes both LHS and RHS are icmps.
assert(isa<ICmpInst>(LHS) && isa<ICmpInst>(RHS) && "Expected icmps.");
// The rest of the logic assumes the LHS condition is true. If that's not the
// case, invert the predicate to make it so.
if (LHSIsFalse)
APred = CmpInst::getInversePredicate(APred);
// Can we infer anything when the two compares have matching operands?
bool IsSwappedOps;
if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, IsSwappedOps)) {
if (Optional<bool> Implication = isImpliedCondMatchingOperands(
APred, ALHS, ARHS, BPred, BLHS, BRHS, IsSwappedOps))
return Implication;
// No amount of additional analysis will infer the second condition, so
// early exit.
return None;
}
// Can we infer anything when the LHS operands match and the RHS operands are
// constants (not necessarily matching)?
if (ALHS == BLHS && isa<ConstantInt>(ARHS) && isa<ConstantInt>(BRHS)) {
if (Optional<bool> Implication = isImpliedCondMatchingImmOperands(
APred, ALHS, cast<ConstantInt>(ARHS), BPred, BLHS,
cast<ConstantInt>(BRHS)))
return Implication;
// No amount of additional analysis will infer the second condition, so
// early exit.
return None;
}
if (APred == BPred)
return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC,
CxtI, DT);
return None;
}
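// A minimal usage sketch for isImpliedCondition (the variable names are
// illustrative, not from this file):
//   // Is Cond known true whenever the dominating condition Dom is true?
//   if (Optional<bool> Imp = isImpliedCondition(Dom, Cond, DL,
//                                               /*LHSIsFalse=*/false,
//                                               /*Depth=*/0, /*AC=*/nullptr,
//                                               /*CxtI=*/nullptr,
//                                               /*DT=*/nullptr))
//     if (*Imp)
//       ; // Cond folds to true on the edge where Dom holds.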
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 0cad20db0964..ecb54e1e4b41 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1,4072 +1,4086 @@
//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file performs vector type splitting and scalarization for LegalizeTypes.
// Scalarization is the act of changing a computation in an illegal one-element
// vector type to be a computation in its scalar element type. For example,
// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
// as a base case when scalarizing vector arithmetic like <4 x f32>, which
// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
// types.
// Splitting is the act of changing a computation in an invalid vector type to
// be a computation in two vectors of half the size. For example, implementing
// <128 x f32> operations in terms of two <64 x f32> operations.
//
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "legalize-types"
//===----------------------------------------------------------------------===//
// Result Vector Scalarization: <1 x ty> -> ty.
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
DEBUG(dbgs() << "Scalarize node result " << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n");
SDValue R = SDValue();
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to scalarize the result of this "
"operator!\n");
case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break;
case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break;
case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
R = ScalarizeVecRes_VecInregOp(N);
break;
case ISD::ANY_EXTEND:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
case ISD::FCANONICALIZE:
R = ScalarizeVecRes_UnaryOp(N);
break;
case ISD::ADD:
case ISD::AND:
case ISD::FADD:
case ISD::FCOPYSIGN:
case ISD::FDIV:
case ISD::FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNAN:
case ISD::FMAXNAN:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::FPOW:
case ISD::FREM:
case ISD::FSUB:
case ISD::MUL:
case ISD::OR:
case ISD::SDIV:
case ISD::SREM:
case ISD::SUB:
case ISD::UDIV:
case ISD::UREM:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
R = ScalarizeVecRes_BinOp(N);
break;
case ISD::FMA:
R = ScalarizeVecRes_TernaryOp(N);
break;
}
// If R is null, the sub-method took care of registering the result.
if (R.getNode())
SetScalarizedVector(SDValue(N, ResNo), R);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), SDLoc(N),
LHS.getValueType(), LHS, RHS, N->getFlags());
}
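// For example, ScalarizeVecRes_BinOp turns a v1f32 FADD into a plain f32 FADD
// on the scalarized operands, carrying the node flags (e.g. fast-math) along
// unchanged.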
SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), SDLoc(N),
Op0.getValueType(), Op0, Op1, Op2);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
return GetScalarizedVector(Op);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
return DAG.getNode(ISD::BITCAST, SDLoc(N),
NewVT, N->getOperand(0));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
// The BUILD_VECTOR operands may be of wider element types and
// we may need to truncate them back to the requested return type.
if (EltVT.isInteger())
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
N->getValueType(0).getVectorElementType(),
N->getOperand(0), N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
NewVT, Op, N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
SDValue Op = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::FPOWI, SDLoc(N),
Op.getValueType(), Op, N->getOperand(1));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
// The value to insert may have a wider type than the vector element type,
// so be sure to truncate it to the element type if necessary.
SDValue Op = N->getOperand(1);
EVT EltVT = N->getValueType(0).getVectorElementType();
if (Op.getValueType() != EltVT)
// FIXME: Can this happen for floating point types?
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
return Op;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
assert(N->isUnindexed() && "Indexed vector load?");
SDValue Result = DAG.getLoad(
ISD::UNINDEXED, N->getExtensionType(),
N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
N->getAAInfo());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
return Result;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
// Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
EVT DestVT = N->getValueType(0).getVectorElementType();
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
// This is a workaround for targets where it's impossible to scalarize the
// result of a conversion, because the source type is legal.
// For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
// are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
// legal and was not scalarized.
// See the similar logic in ScalarizeVecRes_VSETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
Op = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
EVT EltVT = N->getValueType(0).getVectorElementType();
EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
SDValue LHS = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
LHS, DAG.getValueType(ExtVT));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
EVT OpEltVT = OpVT.getVectorElementType();
EVT EltVT = N->getValueType(0).getVectorElementType();
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
Op = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
switch (N->getOpcode()) {
case ISD::ANY_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
}
llvm_unreachable("Illegal extend_vector_inreg opcode");
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
EVT EltVT = N->getValueType(0).getVectorElementType();
SDValue InOp = N->getOperand(0);
if (InOp.getValueType() != EltVT)
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
return InOp;
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
- SDValue Cond = GetScalarizedVector(N->getOperand(0));
+ SDValue Cond = N->getOperand(0);
+ EVT OpVT = Cond.getValueType();
+ SDLoc DL(N);
+ // The vselect result and true/false value operands need scalarizing, but it's
+ // not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
+ // See the similar logic in ScalarizeVecRes_VSETCC
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Cond = GetScalarizedVector(Cond);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Cond = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
SDValue LHS = GetScalarizedVector(N->getOperand(1));
TargetLowering::BooleanContent ScalarBool =
TLI.getBooleanContents(false, false);
TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
// If integer and float booleans have different contents then we can't
// reliably optimize in all cases. There is a full explanation for this in
// DAGCombiner::visitSELECT() where the same issue affects folding
// (select C, 0, 1) to (xor C, 1).
if (TLI.getBooleanContents(false, false) !=
TLI.getBooleanContents(false, true)) {
// At least try the common case where the boolean is generated by a
// comparison.
if (Cond->getOpcode() == ISD::SETCC) {
EVT OpVT = Cond->getOperand(0)->getValueType(0);
ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
VecBool = TLI.getBooleanContents(OpVT);
} else
ScalarBool = TargetLowering::UndefinedBooleanContent;
}
if (ScalarBool != VecBool) {
EVT CondVT = Cond.getValueType();
switch (ScalarBool) {
case TargetLowering::UndefinedBooleanContent:
break;
case TargetLowering::ZeroOrOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
// The vector boolean reads as all ones, but the scalar expects a single 1, so mask.
Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
Cond, DAG.getConstant(1, SDLoc(N), CondVT));
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
assert(VecBool == TargetLowering::UndefinedBooleanContent ||
VecBool == TargetLowering::ZeroOrOneBooleanContent);
// The vector boolean reads as a single 1, but the scalar expects all ones, so sign extend.
Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
Cond, DAG.getValueType(MVT::i1));
break;
}
}
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), Cond, LHS,
GetScalarizedVector(N->getOperand(2)));
}
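// For example, on a target where the vector boolean is 0/-1 but the scalar
// select expects 0/1, the AND with 1 above turns -1 into 1; in the opposite
// direction the SIGN_EXTEND_INREG from i1 turns 1 back into -1.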
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(1));
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), N->getOperand(0), LHS,
GetScalarizedVector(N->getOperand(2)));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
SDValue LHS = GetScalarizedVector(N->getOperand(2));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
N->getOperand(0), N->getOperand(1),
LHS, GetScalarizedVector(N->getOperand(3)),
N->getOperand(4));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() ==
N->getOperand(0).getValueType().isVector() &&
"Scalar/Vector type mismatch");
if (N->getValueType(0).isVector()) return ScalarizeVecRes_VSETCC(N);
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
SDLoc DL(N);
// Turn it into a scalar SETCC.
return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
// Figure out if the scalar is the LHS or RHS and return it.
SDValue Arg = N->getOperand(2).getOperand(0);
if (Arg.isUndef())
return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
return GetScalarizedVector(N->getOperand(Op));
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT OpVT = LHS.getValueType();
EVT NVT = N->getValueType(0).getVectorElementType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
LHS = GetScalarizedVector(LHS);
RHS = GetScalarizedVector(RHS);
} else {
EVT VT = OpVT.getVectorElementType();
LHS = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
RHS = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
// Turn it into a scalar SETCC.
SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
N->getOperand(2));
// Vectors may have different boolean contents from scalars. Promote the
// value appropriately.
ISD::NodeType ExtendCode =
TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
return DAG.getNode(ExtendCode, DL, NVT, Res);
}
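// For example, ScalarizeVecRes_VSETCC lowers a <1 x iN> comparison to an i1
// SETCC on the extracted scalars and then extends it to iN: SIGN_EXTEND when
// the target's vector booleans are 0/-1, ZERO_EXTEND when they are 0/1, and
// ANY_EXTEND when their content is undefined.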
//===----------------------------------------------------------------------===//
// Operand Vector Scalarization <1 x ty> -> ty.
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
if (!Res.getNode()) {
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to scalarize this operator's operand!");
case ISD::BITCAST:
Res = ScalarizeVecOp_BITCAST(N);
break;
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::TRUNCATE:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
Res = ScalarizeVecOp_UnaryOp(N);
break;
case ISD::CONCAT_VECTORS:
Res = ScalarizeVecOp_CONCAT_VECTORS(N);
break;
case ISD::EXTRACT_VECTOR_ELT:
Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
break;
case ISD::VSELECT:
Res = ScalarizeVecOp_VSELECT(N);
break;
case ISD::STORE:
Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
case ISD::FP_ROUND:
Res = ScalarizeVecOp_FP_ROUND(N, OpNo);
break;
}
}
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
/// If the value to convert is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
return DAG.getNode(ISD::BITCAST, SDLoc(N),
N->getValueType(0), Elt);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
/// Do the operation on the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexpected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
N->getValueType(0).getScalarType(), Elt);
// Revectorize the result so the types line up with what the uses of this
// expression expect.
return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op);
}
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
Ops[i] = GetScalarizedVector(N->getOperand(i));
return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
/// so just return the element, ignoring the index.
SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue Res = GetScalarizedVector(N->getOperand(0));
if (Res.getValueType() != VT)
Res = VT.isFloatingPoint()
? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
: DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
return Res;
}
/// If the input condition is a vector that needs to be scalarized, it must be
/// <1 x i1>, so just convert to a normal ISD::SELECT
/// (still with vector output type since that was acceptable if we got here).
SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
EVT VT = N->getValueType(0);
return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1),
N->getOperand(2));
}
/// If the value to store is a vector that needs to be scalarized, it must be
/// <1 x ty>. Just store the element.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
assert(N->isUnindexed() && "Indexed store of one-element vector?");
assert(OpNo == 1 && "Do not know how to scalarize this operand!");
SDLoc dl(N);
if (N->isTruncatingStore())
return DAG.getTruncStore(
N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getMemoryVT().getVectorElementType(), N->getAlignment(),
N->getMemOperand()->getFlags(), N->getAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
N->getAAInfo());
}
/// If the value to round is a vector that needs to be scalarized, it must be
/// <1 x ty>. Convert the element instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
SDValue Elt = GetScalarizedVector(N->getOperand(0));
SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
N->getValueType(0).getVectorElementType(), Elt,
N->getOperand(1));
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
}
//===----------------------------------------------------------------------===//
// Result Vector Splitting
//===----------------------------------------------------------------------===//
/// This method is called when the specified result of the specified node is
/// found to need vector splitting. At this point, the node may also have
/// invalid operands or may have other results that need legalization, we just
/// know that (at least) one result needs vector splitting.
void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
DEBUG(dbgs() << "Split node result: ";
N->dump(&DAG);
dbgs() << "\n");
SDValue Lo, Hi;
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(ResNo), true))
return;
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "SplitVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to split the result of this "
"operator!\n");
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::VSELECT:
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break;
case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
case ISD::MLOAD:
SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
break;
case ISD::MGATHER:
SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
break;
case ISD::SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
break;
case ISD::VECTOR_SHUFFLE:
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
break;
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTTZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::FCANONICALIZE:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
SplitVecRes_ExtendOp(N, Lo, Hi);
break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNAN:
case ISD::FMAXNAN:
case ISD::SDIV:
case ISD::UDIV:
case ISD::FDIV:
case ISD::FPOW:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::UREM:
case ISD::SREM:
case ISD::FREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
SplitVecRes_BinOp(N, Lo, Hi);
break;
case ISD::FMA:
SplitVecRes_TernaryOp(N, Lo, Hi);
break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
if (Lo.getNode())
SetSplitVector(SDValue(N, ResNo), Lo, Hi);
}
void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
}
void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Op0Lo, Op0Hi;
GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
SDValue Op1Lo, Op1Hi;
GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
SDValue Op2Lo, Op2Hi;
GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
SDLoc dl(N);
Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
Op0Lo, Op1Lo, Op2Lo);
Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(),
Op0Hi, Op1Hi, Op2Hi);
}
void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// We know the result is a vector. The input may be either a vector or a
// scalar value.
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SDLoc dl(N);
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
// Handle some special cases efficiently.
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeWidenVector:
break;
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
// A scalar to vector conversion, where the scalar needs expansion.
// If the vector is being split in two then we can just convert the
// expanded pieces.
if (LoVT == HiVT) {
GetExpandedOp(InOp, Lo, Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
return;
}
break;
case TargetLowering::TypeSplitVector:
// If the input is a vector that needs to be split, convert each split
// piece of the input now.
GetSplitVector(InOp, Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
return;
}
// In the general case, convert the input to an integer and split it by hand.
EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
if (DAG.getDataLayout().isBigEndian())
std::swap(LoIntVT, HiIntVT);
SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
}
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
Lo = DAG.getBuildVector(LoVT, dl, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
Hi = DAG.getBuildVector(HiVT, dl, HiOps);
}
void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
SDLoc dl(N);
unsigned NumSubvectors = N->getNumOperands() / 2;
if (NumSubvectors == 1) {
Lo = N->getOperand(0);
Hi = N->getOperand(1);
return;
}
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps);
}
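// For example, SplitVecRes_CONCAT_VECTORS turns (v8i32 concat_vectors a, b,
// c, d) with v2i32 operands into Lo = (v4i32 concat_vectors a, b) and
// Hi = (v4i32 concat_vectors c, d); with exactly two operands the halves are
// simply the operands themselves.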
void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), dl,
TLI.getVectorIdxTy(DAG.getDataLayout())));
}
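// For example, splitting a v8i32 EXTRACT_SUBVECTOR at constant index 4 yields
// two v4i32 extracts from the same source at indices 4 and 8, since the Hi
// half begins LoVT.getVectorNumElements() elements further in.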
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
EVT VecVT = Vec.getValueType();
unsigned VecElems = VecVT.getVectorNumElements();
unsigned SubElems = SubVec.getValueType().getVectorNumElements();
// If we know the index is 0, and we know the subvector doesn't cross the
// boundary between the halves, we can avoid spilling the vector, and insert
// into the lower half of the split vector directly.
// TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
// the index is constant and there is no boundary crossing. But those cases
// don't seem to get hit in practice.
if (ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = ConstIdx->getZExtValue();
if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
return;
}
}
// Spill the vector to the stack.
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo());
// Load the Lo part from the stack slot.
Lo =
DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
StackPtr =
DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
DAG.getConstant(IncrementSize, dl, StackPtr.getValueType()));
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
MinAlign(Alignment, IncrementSize));
}
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
}
void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc DL(N);
SDValue RHSLo, RHSHi;
SDValue RHS = N->getOperand(1);
EVT RHSVT = RHS.getValueType();
if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector)
GetSplitVector(RHS, RHSLo, RHSHi);
else
std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS));
Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo);
Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi);
}
void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDLoc dl(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) =
DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
DAG.getValueType(LoVT));
Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
DAG.getValueType(HiVT));
}
void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
unsigned Opcode = N->getOpcode();
SDValue N0 = N->getOperand(0);
SDLoc dl(N);
SDValue InLo, InHi;
if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(N0, InLo, InHi);
else
std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
EVT InLoVT = InLo.getValueType();
unsigned InNumElements = InLoVT.getVectorNumElements();
EVT OutLoVT, OutHiVT;
std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned OutNumElements = OutLoVT.getVectorNumElements();
assert((2 * OutNumElements) <= InNumElements &&
"Illegal extend vector in reg split");
// *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
// input vector (i.e. we only use InLo):
// OutLo will extend the first OutNumElements from InLo.
// OutHi will extend the next OutNumElements from InLo.
// Shuffle the elements from InLo for OutHi into the bottom elements to
// create a 'fake' InHi.
SmallVector<int, 8> SplitHi(InNumElements, -1);
for (unsigned i = 0; i != OutNumElements; ++i)
SplitHi[i] = i + OutNumElements;
InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
}
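// For example, splitting (v8i32 zero_extend_vector_inreg (v16i8 In)) produces
// two v4i32 halves that both read only In's low eight bytes: Lo extends bytes
// 0-3 of InLo directly, while the shuffle above moves bytes 4-7 of InLo down
// to positions 0-3 so Hi can extend them with the same INREG opcode.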
void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Vec = N->getOperand(0);
SDValue Elt = N->getOperand(1);
SDValue Idx = N->getOperand(2);
SDLoc dl(N);
GetSplitVector(Vec, Lo, Hi);
if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
unsigned IdxVal = CIdx->getZExtValue();
unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
if (IdxVal < LoNumElts)
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx);
else
Hi =
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
DAG.getConstant(IdxVal - LoNumElts, dl,
TLI.getVectorIdxTy(DAG.getDataLayout())));
return;
}
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
return;
// Spill the vector to the stack.
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
Store =
DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT);
// Load the Lo part from the stack slot.
Lo =
DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
DAG.getConstant(IncrementSize, dl,
StackPtr.getValueType()));
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
MinAlign(Alignment, IncrementSize));
}
void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
Hi = DAG.getUNDEF(HiVT);
}
void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
EVT LoVT, HiVT;
SDLoc dl(LD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
ISD::LoadExtType ExtType = LD->getExtensionType();
SDValue Ch = LD->getChain();
SDValue Ptr = LD->getBasePtr();
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT MemoryVT = LD->getMemoryVT();
unsigned Alignment = LD->getOriginalAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT,
Alignment, MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(LD, 1), Ch);
}
void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(MLD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
SDValue Ch = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
SDValue Mask = MLD->getMask();
SDValue Src0 = MLD->getSrc0();
unsigned Alignment = MLD->getOriginalAlignment();
ISD::LoadExtType ExtType = MLD->getExtensionType();
// If the alignment equals the whole vector's size in bytes, the second half
// only gets half of that alignment.
unsigned SecondHalfAlignment =
(Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
Alignment/2 : Alignment;
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Src0Lo, Src0Hi;
if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Src0, Src0Lo, Src0Hi);
else
std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
ExtType, MLD->isExpandingLoad());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
MLD->isExpandingLoad());
MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
ExtType, MLD->isExpandingLoad());
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MLD, 1), Ch);
}
void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Mask = MGT->getMask();
SDValue Src0 = MGT->getValue();
SDValue Index = MGT->getIndex();
unsigned Alignment = MGT->getOriginalAlignment();
// Split Mask operand
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
// Split MemoryVT
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Src0Lo, Src0Hi;
if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Src0, Src0Lo, Src0Hi);
else
std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
MMO);
SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
MMO);
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
}
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
EVT LoVT, HiVT;
SDLoc DL(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// Split the input.
SDValue LL, LH, RL, RH;
std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
}
void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
// Get the dest types - they may not match the input types, e.g. int_to_fp.
EVT LoVT, HiVT;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly for a compile time speedup.
// Otherwise split it by hand.
EVT InVT = N->getOperand(0).getValueType();
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(0), Lo, Hi);
else
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
if (N->getOpcode() == ISD::FP_ROUND) {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1));
} else {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
}
}
void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
EVT SrcVT = N->getOperand(0).getValueType();
EVT DestVT = N->getValueType(0);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
// We can do better than a generic split operation if the extend is doing
// more than just doubling the width of the elements and the following are
// true:
// - The number of vector elements is even,
// - the source type is legal,
// - the type of a split source is illegal,
// - the type of an extended (by doubling element size) source is legal, and
// - the type of that extended source when split is legal.
//
// This won't necessarily completely legalize the operation, but it will
// more effectively move in the right direction and prevent falling down
// to scalarization in many cases due to the input vector being split too
// far.
unsigned NumElements = SrcVT.getVectorNumElements();
if ((NumElements & 1) == 0 &&
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
LLVMContext &Ctx = *DAG.getContext();
EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx);
EVT SplitLoVT, SplitHiVT;
std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
DEBUG(dbgs() << "Split vector extend via incremental extend:";
N->dump(&DAG); dbgs() << "\n");
// Extend the source vector by one step.
SDValue NewSrc =
DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
// Get the low and high halves of the new, extended one step, vector.
std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
// Extend those vector halves the rest of the way.
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
return;
}
}
// Fall back to the generic unary operator splitting otherwise.
SplitVecRes_UnaryOp(N, Lo, Hi);
}
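// For example, (v8i64 zero_extend (v8i8 x)) on a hypothetical target where
// v8i8, v8i16 and v4i16 are legal but v4i8 is not can be extended one step to
// v8i16, split into two v4i16 halves, and then extended the rest of the way
// to v4i64 per half, rather than splitting the v8i8 source directly and
// eventually scalarizing.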
void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
SDValue &Lo, SDValue &Hi) {
// The low and high parts of the original input give four input vectors.
SDValue Inputs[4];
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
EVT NewVT = Inputs[0].getValueType();
unsigned NewElts = NewVT.getVectorNumElements();
// If Lo or Hi uses elements from at most two of the four input vectors, then
// express it as a vector shuffle of those two inputs. Otherwise extract the
// input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
SmallVector<int, 16> Ops;
for (unsigned High = 0; High < 2; ++High) {
SDValue &Output = High ? Hi : Lo;
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
// out with useBuildVector set.
unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
unsigned FirstMaskIdx = High * NewElts;
bool useBuildVector = false;
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element does not index into any input vector.
Ops.push_back(-1);
continue;
}
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Find or create a shuffle vector operand to hold this input.
unsigned OpNo;
for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
if (InputUsed[OpNo] == Input) {
// This input vector is already an operand.
break;
} else if (InputUsed[OpNo] == -1U) {
// Create a new operand for this input vector.
InputUsed[OpNo] = Input;
break;
}
}
if (OpNo >= array_lengthof(InputUsed)) {
// More than two input vectors used! Give up on trying to create a
// shuffle vector. Insert all elements into a BUILD_VECTOR instead.
useBuildVector = true;
break;
}
// Add the mask index for the new shuffle vector.
Ops.push_back(Idx + OpNo * NewElts);
}
if (useBuildVector) {
EVT EltVT = NewVT.getVectorElementType();
SmallVector<SDValue, 16> SVOps;
// Extract the input elements by hand.
for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
// The mask element. This indexes into the input.
int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
// The input vector this mask element indexes into.
unsigned Input = (unsigned)Idx / NewElts;
if (Input >= array_lengthof(Inputs)) {
// The mask element is "undef" or indexes off the end of the input.
SVOps.push_back(DAG.getUNDEF(EltVT));
continue;
}
// Turn the index into an offset from the start of the input vector.
Idx -= Input * NewElts;
// Extract the vector element by hand.
SVOps.push_back(DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input],
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
Output = DAG.getBuildVector(NewVT, dl, SVOps);
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = DAG.getUNDEF(NewVT);
} else {
SDValue Op0 = Inputs[InputUsed[0]];
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = InputUsed[1] == -1U ?
DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
// At least one input vector was used. Create a new shuffle vector.
Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
}
Ops.clear();
}
}
//===----------------------------------------------------------------------===//
// Operand Vector Splitting
//===----------------------------------------------------------------------===//
/// This method is called when the specified operand of the specified node is
/// found to need vector splitting. At this point, all of the result types of
/// the node are known to be legal, but other operands of the node may need
/// legalization as well as the specified one.
bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
DEBUG(dbgs() << "Split node operand: ";
N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom split this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
if (!Res.getNode()) {
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
report_fatal_error("Do not know how to split this operator's "
"operand!\n");
case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
case ISD::TRUNCATE:
Res = SplitVecOp_TruncateHelper(N);
break;
case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break;
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
case ISD::MSTORE:
Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
break;
case ISD::MSCATTER:
Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
break;
case ISD::MGATHER:
Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
break;
case ISD::VSELECT:
Res = SplitVecOp_VSELECT(N, OpNo);
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
Res = SplitVecOp_TruncateHelper(N);
else
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::CTTZ:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::FP_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
case ISD::FCANONICALIZE:
Res = SplitVecOp_UnaryOp(N);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
case ISD::VECREDUCE_SMAX:
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
Res = SplitVecOp_VECREDUCE(N, OpNo);
break;
}
}
// If the result is null, the sub-method took care of registering results etc.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
// The only possibility for an illegal operand is the mask, since result type
// legalization would have handled this node already otherwise.
assert(OpNo == 0 && "Illegal operand must be mask");
SDValue Mask = N->getOperand(0);
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
EVT Src0VT = Src0.getValueType();
SDLoc DL(N);
assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
assert(Lo.getValueType() == Hi.getValueType() &&
"Lo and Hi have differing types");
EVT LoOpVT, HiOpVT;
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
SDValue LoSelect =
DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
SDValue HiSelect =
DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
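// Split the vector operand of a VECREDUCE_* node: combine the two halves
// with the matching binary opcode, then reduce the resulting half-width
// vector with the original reduction opcode.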
SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
SDValue VecOp = N->getOperand(OpNo);
EVT VecVT = VecOp.getValueType();
assert(VecVT.isVector() && "Can only split reduce vector operand");
GetSplitVector(VecOp, Lo, Hi);
EVT LoOpVT, HiOpVT;
std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
bool NoNaN = N->getFlags().hasNoNaNs();
unsigned CombineOpc = 0;
switch (N->getOpcode()) {
case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
case ISD::VECREDUCE_FMAX:
CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
break;
case ISD::VECREDUCE_FMIN:
CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
break;
default:
llvm_unreachable("Unexpected reduce ISD node");
}
// Combine the split subvectors with the appropriate binary operation, then
// reduce the now half-width, partially reduced vector.
SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
}
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc dl(N);
GetSplitVector(N->getOperand(0), Lo, Hi);
EVT InVT = Lo.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
InVT.getVectorNumElements());
Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
// For example, i64 = BITCAST v4i16 on alpha. Typically the vector will
// end up being split all the way down to individual components. Convert the
// split pieces into integers and reassemble.
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
Lo = BitConvertToInteger(Lo);
Hi = BitConvertToInteger(Hi);
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
JoinIntegers(Lo, Hi));
}
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
// We know that the extracted result type is legal.
EVT SubVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
uint64_t LoElts = Lo.getValueType().getVectorNumElements();
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal < LoElts) {
assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
"Extracted subvector crosses vector split!");
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
} else {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
DAG.getConstant(IdxVal - LoElts, dl,
Idx.getValueType()));
}
}
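// Extract an element from a split vector. With a constant index the extract
// is redirected to the Lo or Hi half; otherwise the vector is spilled to a
// stack temporary and the element is loaded back (sub-byte elements are
// widened to i8 first).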
SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue Vec = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VecVT = Vec.getValueType();
if (isa<ConstantSDNode>(Idx)) {
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");
SDValue Lo, Hi;
GetSplitVector(Vec, Lo, Hi);
uint64_t LoElts = Lo.getValueType().getVectorNumElements();
if (IdxVal < LoElts)
return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
return SDValue(DAG.UpdateNodeOperands(N, Hi,
DAG.getConstant(IdxVal - LoElts, SDLoc(N),
Idx.getValueType())), 0);
}
// See if the target wants to custom expand this node.
if (CustomLowerNode(N, N->getValueType(0), true))
return SDValue();
// Make the vector elements byte-addressable if they aren't already.
SDLoc dl(N);
EVT EltVT = VecVT.getVectorElementType();
if (EltVT.getSizeInBits() < 8) {
SmallVector<SDValue, 4> ElementOps;
for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) {
ElementOps.push_back(DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vec,
DAG.getConstant(i, dl, MVT::i8)),
dl, MVT::i8));
}
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements());
Vec = DAG.getBuildVector(VecVT, dl, ElementOps);
}
// Store the vector to the stack.
SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
SDValue Store =
DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
MachinePointerInfo(), EltVT);
}
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
SDValue Lo, Hi;
// *_EXTEND_VECTOR_INREG nodes only reference the lower half of the input, so
// splitting the result has the same effect as splitting the input operand.
SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
}
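// Split a masked gather whose operands need splitting: the mask, index and
// pass-through value are split, two half-width gathers are emitted, and the
// results and chains are recombined.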
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
unsigned OpNo) {
EVT LoVT, HiVT;
SDLoc dl(MGT);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();
SDValue Index = MGT->getIndex();
SDValue Mask = MGT->getMask();
SDValue Src0 = MGT->getValue();
unsigned Alignment = MGT->getOriginalAlignment();
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue Src0Lo, Src0Hi;
if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Src0, Src0Lo, Src0Hi);
else
std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(), MGT->getRanges());
SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
OpsLo, MMO);
MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
Alignment, MGT->getAAInfo(),
MGT->getRanges());
SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
OpsHi, MMO);
// Build a factor node to remember that this load is independent of the
// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(MGT, 1), Ch);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
Hi);
ReplaceValueWith(SDValue(MGT, 0), Res);
return SDValue();
}
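// Split a masked store: the data and mask operands are split in half and two
// masked stores are emitted, joined by a TokenFactor node.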
SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
SDLoc DL(N);
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType());
MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType());
// If the alignment is equal to the vector size, use half of it for the
// second half of the store.
unsigned SecondHalfAlignment =
(Alignment == Data->getValueType(0).getSizeInBits()/8) ?
Alignment/2 : Alignment;
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
N->isTruncatingStore(),
N->isCompressingStore());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
N->isCompressingStore());
MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
SecondHalfAlignment, N->getAAInfo(), N->getRanges());
Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
N->isTruncatingStore(), N->isCompressingStore());
// Build a factor node to remember that this store is independent of the
// other one.
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
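// Split a masked scatter: the data, mask and index operands are split in
// half and two scatters are emitted, joined by a TokenFactor node.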
SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
SDValue Mask = N->getMask();
SDValue Index = N->getIndex();
SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
SDLoc DL(N);
// Split all operands
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
GetSplitVector(Data, DataLo, DataHi);
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
// Split Mask operand
GetSplitVector(Mask, MaskLo, MaskHi);
else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO);
MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi};
Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
DL, OpsHi, MMO);
// Build a factor node to remember that this store is independent of the
// other one.
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
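// Split a regular (unindexed) vector store into two stores of the half-width
// value, incrementing the pointer for the high half and joining the chains
// with a TokenFactor node.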
SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
assert(N->isUnindexed() && "Indexed store of vector?");
assert(OpNo == 1 && "Can only split the stored value");
SDLoc DL(N);
bool isTruncating = N->isTruncatingStore();
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
EVT MemoryVT = N->getMemoryVT();
unsigned Alignment = N->getOriginalAlignment();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDValue Lo, Hi;
GetSplitVector(N->getOperand(1), Lo, Hi);
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
if (isTruncating)
Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
Alignment, MMOFlags, AAInfo);
else
Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
if (isTruncating)
Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
HiMemVT, Alignment, MMOFlags, AAInfo);
else
Hi = DAG.getStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
SDLoc DL(N);
// The input operands all must have the same type, and we know the result
// type is valid. Convert this to a buildvector which extracts all the
// input elements.
// TODO: If the input elements are power-two vectors, we could convert this to
// a new CONCAT_VECTORS node with elements that are half-wide.
SmallVector<SDValue, 32> Elts;
EVT EltVT = N->getValueType(0).getVectorElementType();
for (const SDValue &Op : N->op_values()) {
for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
i != e; ++i) {
Elts.push_back(DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
}
return DAG.getBuildVector(N->getValueType(0), DL, Elts);
}
SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
// The result type is legal, but the input type is illegal. If splitting
// ends up with the result type of each half still being legal, just
// do that. If, however, that would result in an illegal result type,
// we can try to get more clever with power-two vectors. Specifically,
// split the input type, but also widen the result element size, then
// concatenate the halves and truncate again. For example, consider a target
// where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
// vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
// %inlo = v4i32 extract_subvector %in, 0
// %inhi = v4i32 extract_subvector %in, 4
// %lo16 = v4i16 trunc v4i32 %inlo
// %hi16 = v4i16 trunc v4i32 %inhi
// %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
// %res = v8i8 trunc v8i16 %in16
//
// Without this transform, the original truncate would end up being
// scalarized, which is pretty much always a last resort.
SDValue InVec = N->getOperand(0);
EVT InVT = InVec->getValueType(0);
EVT OutVT = N->getValueType(0);
unsigned NumElements = OutVT.getVectorNumElements();
bool IsFloat = OutVT.isFloatingPoint();
// Widening should have already made sure this is a power-two vector
// if we're trying to split it at all. assert() that's true, just in case.
assert(!(NumElements & 1) && "Splitting vector, but not in half!");
unsigned InElementSize = InVT.getScalarSizeInBits();
unsigned OutElementSize = OutVT.getScalarSizeInBits();
// If the input elements are only 1/2 the width of the result elements,
// just use the normal splitting. Our trick only works if there's room
// to split more than once.
if (InElementSize <= OutElementSize * 2)
return SplitVecOp_UnaryOp(N);
SDLoc DL(N);
// Extract the halves of the input via extract_subvector.
SDValue InLoVec, InHiVec;
std::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
// Truncate them to 1/2 the element size.
EVT HalfElementVT = IsFloat ?
EVT::getFloatingPointVT(InElementSize/2) :
EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
NumElements/2);
SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec);
SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec);
// Concatenate them to get the full intermediate truncation result.
EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
HalfHi);
// Now finish up by truncating all the way down to the original result
// type. This should normally be something that ends up being legal directly,
// but in theory if a target has very wide vectors and an annoyingly
// restricted set of legal types, this split can chain to build things up.
return IsFloat
? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
DAG.getTargetConstant(
0, DL, TLI.getPointerTy(DAG.getDataLayout())))
: DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
}
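// Split the operands of a vector SETCC, compare the halves separately, and
// concatenate the i1 results before promoting them to the expected result
// type.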
SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operand types must be vectors");
// The result has a legal vector type, but the input needs splitting.
SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
SDLoc DL(N);
GetSplitVector(N->getOperand(0), Lo0, Hi0);
GetSplitVector(N->getOperand(1), Lo1, Hi1);
unsigned PartElements = Lo0.getValueType().getVectorNumElements();
EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements);
EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements);
LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
return PromoteTargetBoolean(Con, N->getValueType(0));
}
SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
SDValue Lo, Hi;
SDLoc DL(N);
GetSplitVector(N->getOperand(0), Lo, Hi);
EVT InVT = Lo.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
InVT.getVectorNumElements());
Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) {
// The result (and the first input) has a legal vector type, but the second
// input needs splitting.
return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements());
}
//===----------------------------------------------------------------------===//
// Result Vector Widening
//===----------------------------------------------------------------------===//
void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
DEBUG(dbgs() << "Widen node result " << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n");
// See if the target wants to custom widen this node.
if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
return;
SDValue Res = SDValue();
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "WidenVectorResult #" << ResNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to widen the result of this operator!");
case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::VSELECT:
case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE:
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
break;
case ISD::MLOAD:
Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
break;
case ISD::MGATHER:
Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
case ISD::ADD:
case ISD::AND:
case ISD::MUL:
case ISD::MULHS:
case ISD::MULHU:
case ISD::OR:
case ISD::SUB:
case ISD::XOR:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNAN:
case ISD::FMAXNAN:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
Res = WidenVecRes_Binary(N);
break;
case ISD::FADD:
case ISD::FMUL:
case ISD::FPOW:
case ISD::FSUB:
case ISD::FDIV:
case ISD::FREM:
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
Res = WidenVecRes_BinaryCanTrap(N);
break;
case ISD::FCOPYSIGN:
Res = WidenVecRes_FCOPYSIGN(N);
break;
case ISD::FPOWI:
Res = WidenVecRes_POWI(N);
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
Res = WidenVecRes_Shift(N);
break;
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
break;
case ISD::ANY_EXTEND:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
Res = WidenVecRes_Convert(N);
break;
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::CTTZ:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FCOS:
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FFLOOR:
case ISD::FLOG:
case ISD::FLOG10:
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
Res = WidenVecRes_Unary(N);
break;
case ISD::FMA:
Res = WidenVecRes_Ternary(N);
break;
}
// If Res is null, the sub-method took care of registering the result.
if (Res.getNode())
SetWidenedVector(SDValue(N, ResNo), Res);
}
SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
// Ternary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
SDValue InOp3 = GetWidenedVector(N->getOperand(2));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
}
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
}
SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
// Binary op widening for operations that can trap.
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
}
if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
// Operation doesn't trap so just widen as normal.
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags);
}
// No legal vector version so unroll the vector operation and then widen.
if (NumElts == 1)
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
// Since the operation can trap, apply the operation on the original vector.
EVT MaxVT = VT;
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
SmallVector<SDValue, 16> ConcatOps(CurNumElts);
unsigned ConcatEnd = 0; // Current ConcatOps index.
int Idx = 0; // Current Idx into input vectors.
// NumElts := greatest legal vector size (at most WidenVT)
// while (orig. vector has unhandled elements) {
// take munches of size NumElts from the beginning and add to ConcatOps
// NumElts := next smaller supported vector size or 1
// }
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
SDValue EOp1 = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue EOp2 = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
Idx += NumElts;
CurNumElts -= NumElts;
}
do {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
} while (!TLI.isTypeLegal(VT) && NumElts != 1);
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
SDValue EOp1 = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue EOp2 = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
EOp1, EOp2, Flags);
}
CurNumElts = 0;
}
}
// Check to see if we have a single operation with the widened type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
}
// while (Some element of ConcatOps is not of type MaxVT) {
// From the end of ConcatOps, collect elements of the same type and put
// them into an op of the next larger supported type
// }
while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
Idx = ConcatEnd - 1;
VT = ConcatOps[Idx--].getValueType();
while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
Idx--;
int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
EVT NextVT;
do {
NextSize *= 2;
NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
} while (!TLI.isTypeLegal(NextVT));
if (!VT.isVector()) {
// Scalar type, create INSERT_VECTOR_ELT nodes of type NextVT
SDValue VecOp = DAG.getUNDEF(NextVT);
unsigned NumToInsert = ConcatEnd - Idx - 1;
for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
VecOp = DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx],
DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
ConcatOps[Idx+1] = VecOp;
ConcatEnd = Idx + 2;
} else {
// Vector type, create a CONCAT_VECTORS of type NextVT
SDValue undefVec = DAG.getUNDEF(VT);
unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
unsigned RealVals = ConcatEnd - Idx - 1;
unsigned SubConcatEnd = 0;
unsigned SubConcatIdx = Idx + 1;
while (SubConcatEnd < RealVals)
SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
while (SubConcatEnd < OpsToConcat)
SubConcatOps[SubConcatEnd++] = undefVec;
ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
NextVT, SubConcatOps);
ConcatEnd = SubConcatIdx + 1;
}
}
// Check to see if we have a single operation with the widened type.
if (ConcatEnd == 1) {
VT = ConcatOps[0].getValueType();
if (VT == WidenVT)
return ConcatOps[0];
}
// Add undefs of size MaxVT until ConcatOps grows to the length of WidenVT.
unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
if (NumOps != ConcatEnd ) {
SDValue UndefVal = DAG.getUNDEF(MaxVT);
for (unsigned j = ConcatEnd; j < NumOps; ++j)
ConcatOps[j] = UndefVal;
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(ConcatOps.data(), NumOps));
}
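// Widen the result of a conversion node (extensions, truncations, fp<->int
// casts and FP_ROUND/FP_EXTEND). The input is widened, padded with UNDEF, or
// extracted to a matching legal vector type when possible; otherwise the
// conversion is unrolled element by element.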
SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
unsigned Opcode = N->getOpcode();
unsigned InVTNumElts = InVT.getVectorNumElements();
const SDNodeFlags Flags = N->getFlags();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(N->getOperand(0));
InVT = InOp.getValueType();
InVTNumElts = InVT.getVectorNumElements();
if (InVTNumElts == WidenNumElts) {
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
}
if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
// If both input and result vector types are of the same width, extend
// operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
// accepts fewer elements in the result than in the input.
if (Opcode == ISD::SIGN_EXTEND)
return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
if (Opcode == ISD::ZERO_EXTEND)
return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
}
}
if (TLI.isTypeLegal(InWidenVT)) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
if (WidenNumElts % InVTNumElts == 0) {
// Widen the input and call convert on the widened input vector.
unsigned NumConcat = WidenNumElts/InVTNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
Ops[0] = InOp;
SDValue UndefVal = DAG.getUNDEF(InVT);
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = UndefVal;
SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVec);
return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
}
if (InVTNumElts % WidenNumElts == 0) {
SDValue InVal = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
// Extract the input and convert the shortened input vector.
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVal);
return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
}
}
// Otherwise unroll into some nasty scalar code and rebuild the vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = WidenVT.getVectorElementType();
unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
unsigned i;
for (i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
if (N->getNumOperands() == 1)
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
else
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, DL, Ops);
}
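// Widen the result of a *_EXTEND_VECTOR_INREG node. If the widened input has
// the same size as the widened result, the corresponding in-register extend
// is reused; otherwise the scalars are extended individually, padded with
// UNDEF, and rebuilt into a vector.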
SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
unsigned Opcode = N->getOpcode();
SDValue InOp = N->getOperand(0);
SDLoc DL(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT WidenSVT = WidenVT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
EVT InVT = InOp.getValueType();
EVT InSVT = InVT.getVectorElementType();
unsigned InVTNumElts = InVT.getVectorNumElements();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
switch (Opcode) {
case ISD::ANY_EXTEND_VECTOR_INREG:
return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
}
}
}
// Unroll, extend the scalars and rebuild the vector.
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
switch (Opcode) {
case ISD::ANY_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
break;
case ISD::SIGN_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
break;
case ISD::ZERO_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
break;
default:
llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
}
Ops.push_back(Val);
}
while (Ops.size() != WidenNumElts)
Ops.push_back(DAG.getUNDEF(WidenSVT));
return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
// If this is an FCOPYSIGN with the same input types, we can treat it as a
// normal (can trap) binary op.
if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType())
return WidenVecRes_BinaryCanTrap(N);
// If the types are different, fall back to unrolling.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
}
SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
SDValue ShOp = N->getOperand(1);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
SDValue ShOp = N->getOperand(1);
EVT ShVT = ShOp.getValueType();
if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
ShOp = GetWidenedVector(ShOp);
ShVT = ShOp.getValueType();
}
EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
ShVT.getVectorElementType(),
WidenVT.getVectorNumElements());
if (ShVT != ShWidenVT)
ShOp = ModifyToType(ShOp, ShWidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
// Unary op widening.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp);
}
SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
cast<VTSDNode>(N->getOperand(1))->getVT()
.getVectorElementType(),
WidenVT.getVectorNumElements());
SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, WidenLHS, DAG.getValueType(ExtVT));
}
SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
return GetWidenedVector(WidenVec);
}
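// Widen the result of a BITCAST by converting the (possibly promoted or
// widened) input into a vector of the widened size, falling back to a stack
// store/load when no legal vector type can be formed.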
SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDLoc dl(N);
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
break;
case TargetLowering::TypePromoteInteger:
// If the incoming type is a vector that is being promoted, then
// we know that the elements are arranged differently and that we
// must perform the conversion using a stack slot.
if (InVT.isVector())
break;
// If the InOp is promoted to the same size, convert it. Otherwise,
// fall out of the switch and widen the promoted input.
InOp = GetPromotedInteger(InOp);
InVT = InOp.getValueType();
if (WidenVT.bitsEq(InVT))
return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
break;
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypePromoteFloat:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeSplitVector:
break;
case TargetLowering::TypeWidenVector:
// If the InOp is widened to the same size, convert it. Otherwise, fall
// out of the switch and widen the widened input.
InOp = GetWidenedVector(InOp);
InVT = InOp.getValueType();
if (WidenVT.bitsEq(InVT))
// The input widens to the same size. Convert to the widened value.
return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
break;
}
unsigned WidenSize = WidenVT.getSizeInBits();
unsigned InSize = InVT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
// Determine the new input vector type. The new input vector type will use
// the same element type (if it's a vector) or use the input type as a
// vector. It is the same size as the type to widen to.
EVT NewInVT;
unsigned NewNumElts = WidenSize / InSize;
if (InVT.isVector()) {
EVT InEltVT = InVT.getVectorElementType();
NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
WidenSize / InEltVT.getSizeInBits());
} else {
NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
}
if (TLI.isTypeLegal(NewInVT)) {
// Because the result and the input are different vector types, widening
// the result could create a legal type but widening the input might make
// it an illegal type that might lead to repeatedly splitting the input
// and then widening it. To avoid this, we widen the input only if
// it results in a legal type.
SmallVector<SDValue, 16> Ops(NewNumElts);
SDValue UndefVal = DAG.getUNDEF(InVT);
Ops[0] = InOp;
for (unsigned i = 1; i < NewNumElts; ++i)
Ops[i] = UndefVal;
SDValue NewVec;
if (InVT.isVector())
NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
else
NewVec = DAG.getBuildVector(NewInVT, dl, Ops);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
}
}
return CreateStackStoreLoad(InOp, WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
SDLoc dl(N);
// Build a widened vector, filling the new elements with UNDEF.
EVT VT = N->getValueType(0);
// Integer BUILD_VECTOR operands may be larger than the node's vector element
// type. The UNDEFs need to have the same type as the existing operands.
EVT EltVT = N->getOperand(0).getValueType();
unsigned NumElts = VT.getVectorNumElements();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
return DAG.getBuildVector(WidenVT, dl, NewOps);
}
SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
EVT InVT = N->getOperand(0).getValueType();
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
unsigned NumInElts = InVT.getVectorNumElements();
unsigned NumOperands = N->getNumOperands();
bool InputWidened = false; // Indicates we need to widen the input.
if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
// Add undef vectors to widen to the correct length.
unsigned NumConcat = WidenVT.getVectorNumElements() /
InVT.getVectorNumElements();
SDValue UndefVal = DAG.getUNDEF(InVT);
SmallVector<SDValue, 16> Ops(NumConcat);
for (unsigned i=0; i < NumOperands; ++i)
Ops[i] = N->getOperand(i);
for (unsigned i = NumOperands; i != NumConcat; ++i)
Ops[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops);
}
} else {
InputWidened = true;
if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
// The inputs and the result are widened to the same type.
unsigned i;
for (i=1; i < NumOperands; ++i)
if (!N->getOperand(i).isUndef())
break;
if (i == NumOperands)
// Everything but the first operand is an UNDEF so just return the
// widened first operand.
return GetWidenedVector(N->getOperand(0));
if (NumOperands == 2) {
// Replace concat of two operands with a shuffle.
SmallVector<int, 16> MaskOps(WidenNumElts, -1);
for (unsigned i = 0; i < NumInElts; ++i) {
MaskOps[i] = i;
MaskOps[i + NumInElts] = i + WidenNumElts;
}
return DAG.getVectorShuffle(WidenVT, dl,
GetWidenedVector(N->getOperand(0)),
GetWidenedVector(N->getOperand(1)),
MaskOps);
}
}
}
// Fall back to use extracts and build vector.
EVT EltVT = WidenVT.getVectorElementType();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
if (InputWidened)
InOp = GetWidenedVector(InOp);
for (unsigned j=0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
Ops[Idx] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp = N->getOperand(0);
SDValue Idx = N->getOperand(1);
SDLoc dl(N);
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
// Check if we can just return the input vector after widening.
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && InVT == WidenVT)
return InOp;
// Check if we can extract from the vector.
unsigned InNumElts = InVT.getVectorNumElements();
if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
// We could try widening the input to the right length but for now, extract
// the original elements, fill the rest with undefs and build a vector.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
for (i=0; i < NumElts; ++i)
Ops[i] =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(IdxVal + i, dl,
TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
InOp.getValueType(), InOp,
N->getOperand(1), N->getOperand(2));
}
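// Widen a vector load by generating one or more wider loads; the individual
// load chains are merged with a TokenFactor and replace the original chain.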
SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
SDValue Result;
SmallVector<SDValue, 16> LdChain; // Chain for the series of loads
if (ExtType != ISD::NON_EXTLOAD)
Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
else
Result = GenWidenVectorLoads(LdChain, LD);
// If we generate a single load, we can use that for the chain. Otherwise,
// build a factor node to remember the multiple loads are independent and
// chain to that.
SDValue NewChain;
if (LdChain.size() == 1)
NewChain = LdChain[0];
else
NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
// Modified the chain - switch anything that used the old chain to use
// the new one.
ReplaceValueWith(SDValue(N, 1), NewChain);
return Result;
}
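// Widen a masked load: the mask is widened to match the widened result type
// (padding with zeroes when it has to be rebuilt) and a masked load of the
// widened type is emitted; the new chain replaces the old one.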
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
SDValue Mask = N->getMask();
EVT MaskVT = Mask.getValueType();
SDValue Src0 = GetWidenedVector(N->getSrc0());
ISD::LoadExtType ExtType = N->getExtensionType();
SDLoc dl(N);
if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
Mask = GetWidenedVector(Mask);
else {
EVT BoolVT = getSetCCResultType(WidenVT);
// We can't use ModifyToType() because we need to fill the new mask
// elements with zeroes.
unsigned WidenNumElts = BoolVT.getVectorNumElements();
unsigned MaskNumElts = MaskVT.getVectorNumElements();
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
}
SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
Mask, Src0, N->getMemoryVT(),
N->getMemOperand(), ExtType,
N->isExpandingLoad());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
}
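// Widen a masked gather: the mask, index and pass-through operands are
// widened to match the widened result type and a new gather is emitted; the
// new chain replaces the old one.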
SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Mask = N->getMask();
SDValue Src0 = GetWidenedVector(N->getValue());
unsigned NumElts = WideVT.getVectorNumElements();
SDLoc dl(N);
// The mask should be widened as well
Mask = WidenTargetBoolean(Mask, WideVT, true);
// Widen the Index operand
SDValue Index = N->getIndex();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
Index.getValueType().getScalarType(),
NumElts);
Index = ModifyToType(Index, WideIndexVT);
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
return Res;
}
SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
WidenVT, N->getOperand(0));
}
// Return true if this is a node that could have two SETCCs as operands.
static inline bool isLogicalMaskOp(unsigned Opcode) {
switch (Opcode) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return true;
}
return false;
}
// This is used just for the assert in convertMask(). Check that this is
// either a SETCC or a SETCC already handled by convertMask().
#ifndef NDEBUG
static inline bool isSETCCorConvertedSETCC(SDValue N) {
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
for (unsigned i = 1; i < N->getNumOperands(); ++i)
if (!N->getOperand(i)->isUndef())
return false;
N = N.getOperand(0);
}
if (N.getOpcode() == ISD::TRUNCATE)
N = N.getOperand(0);
else if (N.getOpcode() == ISD::SIGN_EXTEND)
N = N.getOperand(0);
if (isLogicalMaskOp(N.getOpcode()))
return isSETCCorConvertedSETCC(N.getOperand(0)) &&
isSETCCorConvertedSETCC(N.getOperand(1));
return (N.getOpcode() == ISD::SETCC ||
ISD::isBuildVectorOfConstantSDNodes(N.getNode()));
}
#endif
// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
// to ToMaskVT if needed with vector extension or truncation.
SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
EVT ToMaskVT) {
// Currently a SETCC or an AND/OR/XOR with two SETCCs is handled.
// FIXME: This code seems to be too restrictive; we might consider
// generalizing it or dropping it.
assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument.");
// Make a new Mask node, with a legal result VT.
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
Ops.push_back(InMask->getOperand(i));
SDValue Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops);
// If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
// extend or truncate is needed.
LLVMContext &Ctx = *DAG.getContext();
unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
if (MaskScalarBits < ToMaskScalBits) {
EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
MaskVT.getVectorNumElements());
Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
} else if (MaskScalarBits > ToMaskScalBits) {
EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
MaskVT.getVectorNumElements());
Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
}
assert(Mask->getValueType(0).getScalarSizeInBits() ==
ToMaskVT.getScalarSizeInBits() &&
"Mask should have the right element size by now.");
// Adjust Mask to the right number of elements.
unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
ZeroIdx);
} else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
EVT SubVT = Mask->getValueType(0);
SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
SubConcatOps[0] = Mask;
for (unsigned i = 1; i < NumSubVecs; ++i)
SubConcatOps[i] = DAG.getUNDEF(SubVT);
Mask =
DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
}
assert((Mask->getValueType(0) == ToMaskVT) &&
"A mask of ToMaskVT should have been produced by now.");
return Mask;
}
// Get the target mask VT, and widen if needed.
EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
assert(SetCC->getOpcode() == ISD::SETCC);
LLVMContext &Ctx = *DAG.getContext();
EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
return MaskVT;
}
// This method tries to handle VSELECT and its mask by legalizing operands
// (which may require widening) and if needed adjusting the mask vector type
// to match that of the VSELECT. Without it, many cases end up with
// scalarization of the SETCC, with many unnecessary instructions.
SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
LLVMContext &Ctx = *DAG.getContext();
SDValue Cond = N->getOperand(0);
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
return SDValue();
// If this is a split VSELECT that was already handled, do nothing.
if (Cond->getValueType(0).getScalarSizeInBits() != 1)
return SDValue();
EVT VSelVT = N->getValueType(0);
// Only handle vector types which are a power of 2.
if (!isPowerOf2_64(VSelVT.getSizeInBits()))
return SDValue();
// Don't touch if this will be scalarized.
EVT FinalVT = VSelVT;
while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx);
if (FinalVT.getVectorNumElements() == 1)
return SDValue();
// If there is support for an i1 vector mask, don't touch.
if (Cond.getOpcode() == ISD::SETCC) {
EVT SetCCOpVT = Cond->getOperand(0).getValueType();
while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
if (SetCCResVT.getScalarSizeInBits() == 1)
return SDValue();
}
// Get the VT and operands for VSELECT, and widen if needed.
SDValue VSelOp1 = N->getOperand(1);
SDValue VSelOp2 = N->getOperand(2);
if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
VSelOp1 = GetWidenedVector(VSelOp1);
VSelOp2 = GetWidenedVector(VSelOp2);
}
// The mask of the VSELECT should have integer elements.
EVT ToMaskVT = VSelVT;
if (!ToMaskVT.getScalarType().isInteger())
ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
SDValue Mask;
if (Cond->getOpcode() == ISD::SETCC) {
EVT MaskVT = getSETCCWidenedResultTy(Cond);
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else if (isLogicalMaskOp(Cond->getOpcode()) &&
Cond->getOperand(0).getOpcode() == ISD::SETCC &&
Cond->getOperand(1).getOpcode() == ISD::SETCC) {
// Cond is (AND/OR/XOR (SETCC, SETCC))
SDValue SETCC0 = Cond->getOperand(0);
SDValue SETCC1 = Cond->getOperand(1);
EVT VT0 = getSETCCWidenedResultTy(SETCC0);
EVT VT1 = getSETCCWidenedResultTy(SETCC1);
unsigned ScalarBits0 = VT0.getScalarSizeInBits();
unsigned ScalarBits1 = VT1.getScalarSizeInBits();
unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
EVT MaskVT;
// If the two SETCCs have different VTs, either extend/truncate one of
// them to the other "towards" ToMaskVT, or truncate one and extend the
// other to ToMaskVT.
if (ScalarBits0 != ScalarBits1) {
EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
MaskVT = WideVT;
else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
MaskVT = NarrowVT;
else
MaskVT = ToMaskVT;
} else
// If the two SETCCs have the same VT, don't change it.
MaskVT = VT0;
// Make new SETCCs and logical nodes.
SETCC0 = convertMask(SETCC0, VT0, MaskVT);
SETCC1 = convertMask(SETCC1, VT1, MaskVT);
Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
// Convert the logical op for VSELECT if needed.
Mask = convertMask(Cond, MaskVT, ToMaskVT);
} else
return SDValue();
return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
if (SDValue Res = WidenVSELECTAndMask(N))
return Res;
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
Cond1 = GetWidenedVector(Cond1);
// If we have to split the condition there is no point in widening the
// select. This would result in a cycle of widening the select ->
// widening the condition operand -> splitting the condition operand ->
// splitting the select -> widening the select. Instead split this select
// further and widen the resulting type.
if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) {
SDValue SplitSelect = SplitVecOp_VSELECT(N, 0);
SDValue Res = ModifyToType(SplitSelect, WidenVT);
return Res;
}
if (Cond1.getValueType() != CondWidenVT)
Cond1 = ModifyToType(Cond1, CondWidenVT);
}
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
return DAG.getNode(N->getOpcode(), SDLoc(N),
WidenVT, Cond1, InOp1, InOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
SDValue InOp1 = GetWidenedVector(N->getOperand(2));
SDValue InOp2 = GetWidenedVector(N->getOperand(3));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
InOp1.getValueType(), N->getOperand(0),
N->getOperand(1), InOp1, InOp2, N->getOperand(4));
}
SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
assert(N->getValueType(0).isVector() ==
N->getOperand(0).getValueType().isVector() &&
"Scalar/Vector type mismatch");
if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT,
InOp1, InOp2, N->getOperand(2));
}
SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getUNDEF(WidenVT);
}
SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
EVT VT = N->getValueType(0);
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
unsigned NumElts = VT.getVectorNumElements();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
// Adjust mask based on new input vector length.
SmallVector<int, 16> NewMask;
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = N->getMaskElt(i);
if (Idx < (int)NumElts)
NewMask.push_back(Idx);
else
NewMask.push_back(Idx - NumElts + WidenNumElts);
}
for (unsigned i = NumElts; i != WidenNumElts; ++i)
NewMask.push_back(-1);
return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
}
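// Illustrative example (values chosen for exposition only): widening a
// <2 x i32> shuffle with mask <1,2> to <4 x i32> keeps element 1 as-is,
// remaps element 2 (lane 0 of the second operand) to 4 because the second
// widened input now starts at lane 4, and pads the tail with -1 (undef),
// giving the mask <1,4,-1,-1>.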
SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
"Operands must be vectors");
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SDValue InOp1 = N->getOperand(0);
EVT InVT = InOp1.getValueType();
assert(InVT.isVector() && "can not widen non-vector type");
EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
InVT.getVectorElementType(), WidenNumElts);
// The input and output types often differ here, and it could be that while
// we'd prefer to widen the result type, the input operands have been split.
// In this case, we also need to split the result of this node as well.
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
SDValue SplitVSetCC = SplitVecOp_VSETCC(N);
SDValue Res = ModifyToType(SplitVSetCC, WidenVT);
return Res;
}
InOp1 = GetWidenedVector(InOp1);
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
// Assume that the input and output will be widened appropriately. If not,
// we will have to unroll it at some point.
assert(InOp1.getValueType() == WidenInVT &&
InOp2.getValueType() == WidenInVT &&
"Input not widened to expected type!");
(void)WidenInVT;
return DAG.getNode(ISD::SETCC, SDLoc(N),
WidenVT, InOp1, InOp2, N->getOperand(2));
}
//===----------------------------------------------------------------------===//
// Widen Vector Operand
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
DEBUG(dbgs() << "Widen node operand " << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
// See if the target wants to custom widen this node.
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
return false;
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to widen this operator's operand!");
case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
Res = WidenVecOp_EXTEND(N);
break;
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::TRUNCATE:
Res = WidenVecOp_Convert(N);
break;
}
// If Res is null, the sub-method took care of registering the result.
if (!Res.getNode()) return false;
// If the result is N, the sub-method updated N in place. Tell the legalizer
// core about this.
if (Res.getNode() == N)
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
"Invalid operand expansion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue InOp = N->getOperand(0);
// If some legalization strategy other than widening is used on the operand,
// we can't safely assume that just extending the low lanes is the correct
// transformation.
if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector)
return WidenVecOp_Convert(N);
InOp = GetWidenedVector(InOp);
assert(VT.getVectorNumElements() <
InOp.getValueType().getVectorNumElements() &&
"Input wasn't widened!");
// We may need to further widen the operand until it has the same total
// vector size as the result.
EVT InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits()) {
EVT InEltVT = InVT.getVectorElementType();
for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE; i < e; ++i) {
EVT FixedVT = (MVT::SimpleValueType)i;
EVT FixedEltVT = FixedVT.getVectorElementType();
if (TLI.isTypeLegal(FixedVT) &&
FixedVT.getSizeInBits() == VT.getSizeInBits() &&
FixedEltVT == InEltVT) {
assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
"Not enough elements in the fixed type for the operand!");
assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
"We can't have the same type as we started with!");
if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
InOp = DAG.getNode(
ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
else
InOp = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
break;
}
}
InVT = InOp.getValueType();
if (InVT.getSizeInBits() != VT.getSizeInBits())
// We couldn't find a legal vector type that was a widening of the input
// and could be extended in-register to the result type, so we have to
// scalarize.
return WidenVecOp_Convert(N);
}
// Use special DAG nodes to represent the operation of extending the
// low lanes.
switch (N->getOpcode()) {
default:
llvm_unreachable("Extend legalization on on extend operation!");
case ISD::ANY_EXTEND:
return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
case ISD::SIGN_EXTEND:
return DAG.getSignExtendVectorInReg(InOp, DL, VT);
case ISD::ZERO_EXTEND:
return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
}
}
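// Example of the in-register path above (assuming a target with 128-bit
// vectors where a v4i8 operand is widened to v16i8): for a SIGN_EXTEND of
// v4i8 to v4i32, the widened input already matches the 128-bit result size,
// so the node becomes a SIGN_EXTEND_VECTOR_INREG that extends only the low
// four lanes.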
SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) {
// The result (and first input) is legal, but the second input is illegal.
// We can't do much to fix that, so just unroll and let the extracts off of
// the second input be widened as needed later.
return DAG.UnrollVectorOp(N);
}
SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
// Since the result is legal and the input is illegal, it is unlikely that we
// can fix the input to a legal type, so unroll the convert into some scalar
// code and create a nasty build vector.
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
unsigned NumElts = VT.getVectorNumElements();
SDValue InOp = N->getOperand(0);
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
unsigned Opcode = N->getOpcode();
SmallVector<SDValue, 16> Ops(NumElts);
for (unsigned i=0; i < NumElts; ++i)
Ops[i] = DAG.getNode(
Opcode, dl, EltVT,
DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue InOp = GetWidenedVector(N->getOperand(0));
EVT InWidenVT = InOp.getValueType();
SDLoc dl(N);
// Check if we can convert between two legal vector types and extract.
unsigned InWidenSize = InWidenVT.getSizeInBits();
unsigned Size = VT.getSizeInBits();
// x86mmx is not an acceptable vector element type, so don't try.
if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) {
unsigned NewNumElts = InWidenSize / Size;
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
}
return CreateStackStoreLoad(InOp, VT);
}
SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
// If the input vector is not legal, it is likely that we will not find a
// legal vector of the same size. Replace the concatenate vector with a
// nasty build vector.
EVT VT = N->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
unsigned NumElts = VT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(NumElts);
EVT InVT = N->getOperand(0).getValueType();
unsigned NumInElts = InVT.getVectorNumElements();
unsigned Idx = 0;
unsigned NumOperands = N->getNumOperands();
for (unsigned i=0; i < NumOperands; ++i) {
SDValue InOp = N->getOperand(i);
if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
InOp = GetWidenedVector(InOp);
for (unsigned j=0; j < NumInElts; ++j)
Ops[Idx++] = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
N->getValueType(0), InOp, N->getOperand(1));
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue InOp = GetWidenedVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
N->getValueType(0), InOp, N->getOperand(1));
}
SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
// We have to widen the value, but we only want to store the original
// vector type.
StoreSDNode *ST = cast<StoreSDNode>(N);
SmallVector<SDValue, 16> StChain;
if (ST->isTruncatingStore())
GenWidenVectorTruncStores(StChain, ST);
else
GenWidenVectorStores(StChain, ST);
if (StChain.size() == 1)
return StChain[0];
else
return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
}
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
EVT MaskVT = Mask.getValueType();
SDValue StVal = MST->getValue();
// Widen the value
SDValue WideVal = GetWidenedVector(StVal);
SDLoc dl(N);
if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
Mask = GetWidenedVector(Mask);
else {
// The mask should be widened as well.
EVT BoolVT = getSetCCResultType(WideVal.getValueType());
// We can't use ModifyToType() because we should fill the mask with
// zeroes.
unsigned WidenNumElts = BoolVT.getVectorNumElements();
unsigned MaskNumElts = MaskVT.getVectorNumElements();
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
}
assert(Mask.getValueType().getVectorNumElements() ==
WideVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
Mask, MST->getMemoryVT(), MST->getMemOperand(),
false, MST->isCompressingStore());
}
SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
assert(OpNo == 1 && "Can widen only data operand of mscatter");
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue DataOp = MSC->getValue();
SDValue Mask = MSC->getMask();
// Widen the value.
SDValue WideVal = GetWidenedVector(DataOp);
EVT WideVT = WideVal.getValueType();
unsigned NumElts = WideVal.getValueType().getVectorNumElements();
SDLoc dl(N);
// The mask should be widened as well.
Mask = WidenTargetBoolean(Mask, WideVT, true);
// Widen index.
SDValue Index = MSC->getIndex();
EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
Index.getValueType().getScalarType(),
NumElts);
Index = ModifyToType(Index, WideIndexVT);
SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
MSC->getMemoryVT(), dl, Ops,
MSC->getMemOperand());
}
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDLoc dl(N);
// WARNING: In this code we widen the compare instruction with garbage.
// This garbage may contain denormal floats which may be slow. Is this a real
// concern? Should we zero the unused lanes if this is a float compare?
// Get a new SETCC node to compare the newly widened operands.
// Only some of the compared elements are legal.
EVT SVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
InOp0.getValueType());
SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
SVT, InOp0, InOp1, N->getOperand(2));
// Extract the needed results from the result vector.
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
SVT.getVectorElementType(),
N->getValueType(0).getVectorNumElements());
SDValue CC = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
return PromoteTargetBoolean(CC, N->getValueType(0));
}
//===----------------------------------------------------------------------===//
// Vector Widening Utilities
//===----------------------------------------------------------------------===//
// Utility function to find the type to chop up a widened vector for load/store.
// TLI: Target lowering used to determine legal types.
// Width: Width left to load/store.
// WidenVT: The widened vector type to load to/store from.
// Align: If 0, don't allow use of a wider type.
// WidenEx: If Align is not 0, the additional amount we can load/store from.
static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
unsigned Width, EVT WidenVT,
unsigned Align = 0, unsigned WidenEx = 0) {
EVT WidenEltVT = WidenVT.getVectorElementType();
unsigned WidenWidth = WidenVT.getSizeInBits();
unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
unsigned AlignInBits = Align*8;
// If we have one element to load/store, return it.
EVT RetVT = WidenEltVT;
if (Width == WidenEltWidth)
return RetVT;
// See if there is a larger legal integer than the element type to load/store.
unsigned VT;
for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
EVT MemVT((MVT::SimpleValueType) VT);
unsigned MemVTWidth = MemVT.getSizeInBits();
if (MemVT.getSizeInBits() <= WidenEltWidth)
break;
auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
if ((Action == TargetLowering::TypeLegal ||
Action == TargetLowering::TypePromoteInteger) &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
RetVT = MemVT;
break;
}
}
// See if there is a larger vector type to load/store that has the same vector
// element type and whose width evenly divides the width of WidenVT.
for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
EVT MemVT = (MVT::SimpleValueType) VT;
unsigned MemVTWidth = MemVT.getSizeInBits();
if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
(WidenWidth % MemVTWidth) == 0 &&
isPowerOf2_32(WidenWidth / MemVTWidth) &&
(MemVTWidth <= Width ||
(Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
return MemVT;
}
}
return RetVT;
}
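// For illustration (assuming i16 is legal and no wider type fits): when asked
// for the remaining 24 bits of a load being widened to v4i8 (32 bits), the
// integer scan above settles on i16, leaving 8 bits for a follow-up query
// that returns i8.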
// Builds a vector type from scalar loads.
// VecTy: Resulting vector type.
// LdOps: Load operators to build a vector type from.
// [Start,End): The range of loads to use.
static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
SmallVectorImpl<SDValue> &LdOps,
unsigned Start, unsigned End) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(LdOps[Start]);
EVT LdTy = LdOps[Start].getValueType();
unsigned Width = VecTy.getSizeInBits();
unsigned NumElts = Width / LdTy.getSizeInBits();
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
unsigned Idx = 1;
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
for (unsigned i = Start + 1; i != End; ++i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
NumElts = Width / NewLdTy.getSizeInBits();
NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
// Readjust the insert position based on the new load type.
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
}
VecOp = DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
}
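// Index-rescaling example (hypothetical loads, for exposition): combining an
// i16 load followed by an i8 load into a 32-bit vector type places the i16 in
// lane 0 of a v2i16; when the i8 is reached, the vector is bitcast to v4i8 and
// Idx is rescaled from 1 to 2, so the i8 lands in byte lane 2.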
SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD) {
// The strategy assumes that we can efficiently load power-of-two widths.
// The routine chops the vector into the largest vector loads with the same
// element type or scalar loads and then recombines them into the widened
// vector type.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
unsigned WidenWidth = WidenVT.getSizeInBits();
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
unsigned Align = LD->getAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth;
unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads.
// Find the vector type that we can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
Align, MMOFlags, AAInfo);
LdChain.push_back(LdOp.getValue(1));
// Check if we can load the element with one instruction.
if (LdWidth <= NewVTWidth) {
if (!NewVT.isVector()) {
unsigned NumElts = WidenWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
}
if (NewVT == WidenVT)
return LdOp;
assert(WidenWidth % NewVTWidth == 0);
unsigned NumConcat = WidenWidth / NewVTWidth;
SmallVector<SDValue, 16> ConcatOps(NumConcat);
SDValue UndefVal = DAG.getUNDEF(NewVT);
ConcatOps[0] = LdOp;
for (unsigned i = 1; i != NumConcat; ++i)
ConcatOps[i] = UndefVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
}
// Load the vector by using multiple loads, from the largest vector type down to scalar.
SmallVector<SDValue, 16> LdOps;
LdOps.push_back(LdOp);
LdWidth -= NewVTWidth;
unsigned Offset = 0;
while (LdWidth > 0) {
unsigned Increment = NewVTWidth / 8;
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Increment, dl, BasePtr.getValueType()));
SDValue L;
if (LdWidth < NewVTWidth) {
// The current type we are using is too large. Find a better size.
NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
MinAlign(Align, Increment), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
// Later code assumes the vector loads produced will be mergeable, so we
// must pad the final entry up to the previous width. Scalars are
// combined separately.
SmallVector<SDValue, 16> Loads;
Loads.push_back(L);
unsigned size = L->getValueSizeInBits(0);
while (size < LdOp->getValueSizeInBits(0)) {
Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
size += L->getValueSizeInBits(0);
}
L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
}
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
MinAlign(Align, Increment), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
}
LdOps.push_back(L);
LdWidth -= NewVTWidth;
}
// Build the vector from the load operations.
unsigned End = LdOps.size();
if (!LdOps[0].getValueType().isVector())
// All the loads are scalar loads.
return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
// If the load contains vectors, build the vector using concat vectors.
// All of the vectors used for loading are power-of-two sized, and the
// scalar loads can be combined to make a power-of-two sized vector.
SmallVector<SDValue, 16> ConcatOps(End);
int i = End - 1;
int Idx = End;
EVT LdTy = LdOps[i].getValueType();
// First, combine the scalar loads to a vector.
if (!LdTy.isVector()) {
for (--i; i >= 0; --i) {
LdTy = LdOps[i].getValueType();
if (LdTy.isVector())
break;
}
ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
}
ConcatOps[--Idx] = LdOps[i];
for (--i; i >= 0; --i) {
EVT NewLdTy = LdOps[i].getValueType();
if (NewLdTy != LdTy) {
// Create a larger vector.
ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
makeArrayRef(&ConcatOps[Idx], End - Idx));
Idx = End - 1;
LdTy = NewLdTy;
}
ConcatOps[--Idx] = LdOps[i];
}
if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
makeArrayRef(&ConcatOps[Idx], End - Idx));
// We need to fill the rest with undefs to build the vector.
unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
SmallVector<SDValue, 16> WidenOps(NumOps);
SDValue UndefVal = DAG.getUNDEF(LdTy);
{
unsigned i = 0;
for (; i != End-Idx; ++i)
WidenOps[i] = ConcatOps[Idx+i];
for (; i != NumOps; ++i)
WidenOps[i] = UndefVal;
}
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps);
}
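// As a sketch of the overall strategy (target-dependent; e.g. a 64-bit target
// with legal v4i32): a v3i32 load widened to v4i32 may be emitted as a single
// v4i32 load when the alignment allows touching the extra lane, or otherwise
// as a 64-bit load followed by a 32-bit load that are recombined by the code
// above.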
SDValue
DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
LoadSDNode *LD,
ISD::LoadExtType ExtType) {
// For extension loads, it may not be more efficient to chop up the vector
// and then extend it. Instead, we unroll the load and build a new vector.
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
EVT LdVT = LD->getMemoryVT();
SDLoc dl(LD);
assert(LdVT.isVector() && WidenVT.isVector());
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
unsigned Align = LD->getAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
unsigned NumElts = LdVT.getVectorNumElements();
// Load each element and widen.
unsigned WidenNumElts = WidenVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Increment = LdEltVT.getSizeInBits() / 8;
Ops[0] =
DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
LdEltVT, Align, MMOFlags, AAInfo);
LdChain.push_back(Ops[0].getValue(1));
unsigned i = 0, Offset = Increment;
for (i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr,
DAG.getConstant(Offset, dl,
BasePtr.getValueType()));
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
Align, MMOFlags, AAInfo);
LdChain.push_back(Ops[i].getValue(1));
}
// Fill the rest with undefs.
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
return DAG.getBuildVector(WidenVT, dl, Ops);
}
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
// The routine chops the vector into the largest vector stores with the same
// element type or scalar stores.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
unsigned Align = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
unsigned StWidth = StVT.getSizeInBits();
EVT ValVT = ValOp.getValueType();
unsigned ValWidth = ValVT.getSizeInBits();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned ValEltWidth = ValEltVT.getSizeInBits();
assert(StVT.getVectorElementType() == ValEltVT);
int Idx = 0; // current index to store
unsigned Offset = 0; // offset from base to store
while (StWidth != 0) {
// Find the largest vector type we can store with.
EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
unsigned NewVTWidth = NewVT.getSizeInBits();
unsigned Increment = NewVTWidth / 8;
if (NewVT.isVector()) {
unsigned NumVTElts = NewVT.getVectorNumElements();
do {
SDValue EOp = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
MinAlign(Align, Offset), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Increment, dl,
BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
} else {
// Cast the vector to the scalar type we can store.
unsigned NumElts = ValWidth / NewVTWidth;
EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
// Readjust index position based on new vector type.
Idx = Idx * ValEltWidth / NewVTWidth;
do {
SDValue EOp = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
DAG.getConstant(Idx++, dl,
TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
MinAlign(Align, Offset), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(Increment, dl,
BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
// Restore the index to be relative to the original widened element type.
Idx = Idx * NewVTWidth / ValEltWidth;
}
}
}
void
DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// For truncating stores, it may not be more efficient to truncate the vector
// and then store it. Instead, we extract each element and then store it.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
unsigned Align = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
EVT StVT = ST->getMemoryVT();
EVT ValVT = ValOp.getValueType();
// It must be true that the wide vector type is bigger than the memory type
// being stored to.
assert(StVT.isVector() && ValOp.getValueType().isVector());
assert(StVT.bitsLT(ValOp.getValueType()));
// For truncating stores, we cannot play the trick of chopping legal vector
// types and bitcasting to the right type. Instead, we unroll the store.
EVT StEltVT = StVT.getVectorElementType();
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
unsigned NumElts = StVT.getVectorNumElements();
SDValue EOp = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo(), StEltVT, Align,
MMOFlags, AAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
BasePtr,
DAG.getConstant(Offset, dl,
BasePtr.getValueType()));
SDValue EOp = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
StChain.push_back(DAG.getTruncStore(
Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
StEltVT, MinAlign(Align, Offset), MMOFlags, AAInfo));
}
}
/// Modifies a vector input (widens or narrows) to a vector of NVT. The
/// input vector must have the same element type as NVT.
/// FillWithZeroes specifies that the vector should be widened with zeroes.
SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
bool FillWithZeroes) {
// Note that InOp might have been widened so it might already have
// the right width or it might need to be narrowed.
EVT InVT = InOp.getValueType();
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
SDLoc dl(InOp);
// Check if InOp already has the right width.
if (InVT == NVT)
return InOp;
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
unsigned NumConcat = WidenNumElts / InNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
DAG.getUNDEF(InVT);
Ops[0] = InOp;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = FillVal;
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
}
if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
return DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
// Fall back to extract and build.
SmallVector<SDValue, 16> Ops(WidenNumElts);
EVT EltVT = NVT.getVectorElementType();
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
unsigned Idx;
for (Idx = 0; Idx < MinNumElts; ++Idx)
Ops[Idx] = DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
Ops[Idx] = FillVal;
return DAG.getBuildVector(NVT, dl, Ops);
}
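// For example, widening a v4i32 input to NVT = v8i32 takes the CONCAT_VECTORS
// path above: the result is the input concatenated with one v4i32 filler
// operand (undef, or zero when FillWithZeroes is set).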
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 823e77850c4b..0ff154784f68 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1,7957 +1,7958 @@
//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This implements the SelectionDAG class.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/SelectionDAG.h"
#include "SDNodeDbgValue.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <set>
#include <string>
#include <utility>
#include <vector>
using namespace llvm;
/// makeVTList - Return an instance of the SDVTList struct initialized with the
/// specified members.
static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
SDVTList Res = {VTs, NumVTs};
return Res;
}
// Default null implementations of the callbacks.
void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
//===----------------------------------------------------------------------===//
// ConstantFPSDNode Class
//===----------------------------------------------------------------------===//
/// isExactlyValue - We don't rely on operator== working on double values, as
/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
/// As such, this method can be used to do an exact bit-for-bit comparison of
/// two floating point values.
bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
return getValueAPF().bitwiseIsEqual(V);
}
bool ConstantFPSDNode::isValueValidForType(EVT VT,
const APFloat& Val) {
assert(VT.isFloatingPoint() && "Can only convert between FP types");
// convert modifies in place, so make a copy.
APFloat Val2 = APFloat(Val);
bool losesInfo;
(void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven,
&losesInfo);
return !losesInfo;
}
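// For example, the double value 0.5 converts to MVT::f32 exactly, so it is
// valid for that type, while 0.1 (already an approximation in double) loses
// further precision in the conversion and is rejected.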
//===----------------------------------------------------------------------===//
// ISD Namespace
//===----------------------------------------------------------------------===//
bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
auto *BV = dyn_cast<BuildVectorSDNode>(N);
if (!BV)
return false;
APInt SplatUndef;
unsigned SplatBitSize;
bool HasUndefs;
EVT EltVT = N->getValueType(0).getVectorElementType();
return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs) &&
EltVT.getSizeInBits() >= SplatBitSize;
}
// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
// specializations of the more general isConstantSplatVector()?
bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
unsigned i = 0, e = N->getNumOperands();
// Skip over all of the undef values.
while (i != e && N->getOperand(i).isUndef())
++i;
// Do not accept an all-undef vector.
if (i == e) return false;
// Do not accept build_vectors that aren't all constants or which have non-~0
// elements. We have to be a bit careful here, as the type of the constant
// may not be the same as the type of the vector elements due to type
// legalization (the elements are promoted to a legal type for the target and
// a vector of a type may be legal when the base element type is not).
// We only want to check enough bits to cover the vector elements, because
// we care if the resultant vector is all ones, not whether the individual
// constants are.
SDValue NotZero = N->getOperand(i);
unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
if (CN->getAPIntValue().countTrailingOnes() < EltSize)
return false;
} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
return false;
} else
return false;
// Okay, we have at least one ~0 value, check to see if the rest match or are
// undefs. Even with the above element type twiddling, this should be OK, as
// the same type legalization should have applied to all the elements.
for (++i; i != e; ++i)
if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
return false;
return true;
}
bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// Look through a bit convert.
while (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0).getNode();
if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
bool IsAllUndef = true;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
IsAllUndef = false;
// Do not accept build_vectors that aren't all constants or which have non-0
// elements. We have to be a bit careful here, as the type of the constant
// may not be the same as the type of the vector elements due to type
// legalization (the elements are promoted to a legal type for the target
// and a vector of a type may be legal when the base element type is not).
// We only want to check enough bits to cover the vector elements, because
// we care if the resultant vector is all zeros, not whether the individual
// constants are.
unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
if (CN->getAPIntValue().countTrailingZeros() < EltSize)
return false;
} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
return false;
} else
return false;
}
// Do not accept an all-undef vector.
if (IsAllUndef)
return false;
return true;
}
bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
if (!isa<ConstantSDNode>(Op))
return false;
}
return true;
}
bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Op : N->op_values()) {
if (Op.isUndef())
continue;
if (!isa<ConstantFPSDNode>(Op))
return false;
}
return true;
}
bool ISD::allOperandsUndef(const SDNode *N) {
// Return false if the node has no operands.
// This is "logically inconsistent" with the definition of "all" but
// is probably the desired behavior.
if (N->getNumOperands() == 0)
return false;
for (const SDValue &Op : N->op_values())
if (!Op.isUndef())
return false;
return true;
}
ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
switch (ExtType) {
case ISD::EXTLOAD:
return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
case ISD::SEXTLOAD:
return ISD::SIGN_EXTEND;
case ISD::ZEXTLOAD:
return ISD::ZERO_EXTEND;
default:
break;
}
llvm_unreachable("Invalid LoadExtType");
}
ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
// To perform this operation, we just need to swap the L and G bits of the
// operation.
unsigned OldL = (Operation >> 2) & 1;
unsigned OldG = (Operation >> 1) & 1;
return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits
(OldL << 1) | // New G bit
(OldG << 2)); // New L bit.
}
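// For example, getSetCCSwappedOperands(ISD::SETULT) returns ISD::SETUGT:
// only the L and G bits are exchanged, so "a <u b" becomes "b >u a".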
ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
unsigned Operation = Op;
if (isInteger)
Operation ^= 7; // Flip L, G, E bits, but not U.
else
Operation ^= 15; // Flip all of the condition bits.
if (Operation > ISD::SETTRUE2)
Operation &= ~8; // Don't let N and U bits get set.
return ISD::CondCode(Operation);
}
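// For example, getSetCCInverse(ISD::SETLT, /*isInteger=*/true) flips the L, G
// and E bits and returns ISD::SETGE, matching !(a < b) == (a >= b).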
/// For an integer comparison, return 1 if the comparison is a signed operation
/// and 2 if it is an unsigned comparison. Return zero if the operation
/// does not depend on the sign of the input (setne and seteq).
static int isSignedOp(ISD::CondCode Opcode) {
switch (Opcode) {
default: llvm_unreachable("Illegal integer setcc operation!");
case ISD::SETEQ:
case ISD::SETNE: return 0;
case ISD::SETLT:
case ISD::SETLE:
case ISD::SETGT:
case ISD::SETGE: return 1;
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETUGT:
case ISD::SETUGE: return 2;
}
}
ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
bool IsInteger) {
if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed integer setcc with an unsigned integer setcc.
return ISD::SETCC_INVALID;
unsigned Op = Op1 | Op2; // Combine all of the condition bits.
// If the N and U bits get set, then the resultant comparison DOES suddenly
// care about orderedness, and it is true when ordered.
if (Op > ISD::SETTRUE2)
Op &= ~16; // Clear the U bit if the N bit is set.
// Canonicalize illegal integer setcc's.
if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
Op = ISD::SETNE;
return ISD::CondCode(Op);
}
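// For example, ORing the integer conditions SETLT and SETGT combines the L and
// G bits into SETNE, matching (a < b) || (a > b) == (a != b).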
ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
bool IsInteger) {
if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed setcc with an unsigned setcc.
return ISD::SETCC_INVALID;
// Combine all of the condition bits.
ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
// Canonicalize illegal integer setcc's.
if (IsInteger) {
switch (Result) {
default: break;
case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
case ISD::SETOEQ: // SETEQ & SETU[LG]E
case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE
case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE
case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE
}
}
return Result;
}
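// For example, ANDing the integer conditions SETLE and SETGE leaves only the E
// bit set and yields SETEQ, matching (a <= b) && (a >= b) == (a == b).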
//===----------------------------------------------------------------------===//
// SDNode Profile Support
//===----------------------------------------------------------------------===//
/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
ID.AddInteger(OpC);
}
/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
/// solely with their pointer.
static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
ID.AddPointer(VTList.VTs);
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDValue> Ops) {
for (auto& Op : Ops) {
ID.AddPointer(Op.getNode());
ID.AddInteger(Op.getResNo());
}
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDUse> Ops) {
for (auto& Op : Ops) {
ID.AddPointer(Op.getNode());
ID.AddInteger(Op.getResNo());
}
}
static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
SDVTList VTList, ArrayRef<SDValue> OpList) {
AddNodeIDOpcode(ID, OpC);
AddNodeIDValueTypes(ID, VTList);
AddNodeIDOperands(ID, OpList);
}
/// If this is an SDNode with special info, add this info to the NodeID data.
static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
switch (N->getOpcode()) {
case ISD::TargetExternalSymbol:
case ISD::ExternalSymbol:
case ISD::MCSymbol:
llvm_unreachable("Should only be used on nodes with operands");
default: break; // Normal nodes don't need extra info.
case ISD::TargetConstant:
case ISD::Constant: {
const ConstantSDNode *C = cast<ConstantSDNode>(N);
ID.AddPointer(C->getConstantIntValue());
ID.AddBoolean(C->isOpaque());
break;
}
case ISD::TargetConstantFP:
case ISD::ConstantFP:
ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
break;
case ISD::TargetGlobalAddress:
case ISD::GlobalAddress:
case ISD::TargetGlobalTLSAddress:
case ISD::GlobalTLSAddress: {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
ID.AddPointer(GA->getGlobal());
ID.AddInteger(GA->getOffset());
ID.AddInteger(GA->getTargetFlags());
break;
}
case ISD::BasicBlock:
ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
break;
case ISD::Register:
ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
break;
case ISD::RegisterMask:
ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
break;
case ISD::SRCVALUE:
ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
break;
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
break;
case ISD::JumpTable:
case ISD::TargetJumpTable:
ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
break;
case ISD::ConstantPool:
case ISD::TargetConstantPool: {
const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
ID.AddInteger(CP->getAlignment());
ID.AddInteger(CP->getOffset());
if (CP->isMachineConstantPoolEntry())
CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
else
ID.AddPointer(CP->getConstVal());
ID.AddInteger(CP->getTargetFlags());
break;
}
case ISD::TargetIndex: {
const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
ID.AddInteger(TI->getIndex());
ID.AddInteger(TI->getOffset());
ID.AddInteger(TI->getTargetFlags());
break;
}
case ISD::LOAD: {
const LoadSDNode *LD = cast<LoadSDNode>(N);
ID.AddInteger(LD->getMemoryVT().getRawBits());
ID.AddInteger(LD->getRawSubclassData());
ID.AddInteger(LD->getPointerInfo().getAddrSpace());
break;
}
case ISD::STORE: {
const StoreSDNode *ST = cast<StoreSDNode>(N);
ID.AddInteger(ST->getMemoryVT().getRawBits());
ID.AddInteger(ST->getRawSubclassData());
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
break;
}
case ISD::ATOMIC_CMP_SWAP:
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {
const AtomicSDNode *AT = cast<AtomicSDNode>(N);
ID.AddInteger(AT->getMemoryVT().getRawBits());
ID.AddInteger(AT->getRawSubclassData());
ID.AddInteger(AT->getPointerInfo().getAddrSpace());
break;
}
case ISD::PREFETCH: {
const MemSDNode *PF = cast<MemSDNode>(N);
ID.AddInteger(PF->getPointerInfo().getAddrSpace());
break;
}
case ISD::VECTOR_SHUFFLE: {
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
i != e; ++i)
ID.AddInteger(SVN->getMaskElt(i));
break;
}
case ISD::TargetBlockAddress:
case ISD::BlockAddress: {
const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
ID.AddPointer(BA->getBlockAddress());
ID.AddInteger(BA->getOffset());
ID.AddInteger(BA->getTargetFlags());
break;
}
} // end switch (N->getOpcode())
// Target specific memory nodes could also have address spaces to check.
if (N->isTargetMemoryOpcode())
ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
}
/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
/// data.
static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
AddNodeIDOpcode(ID, N->getOpcode());
// Add the return value info.
AddNodeIDValueTypes(ID, N->getVTList());
// Add the operand info.
AddNodeIDOperands(ID, N->ops());
// Handle SDNode leaves with special info.
AddNodeIDCustom(ID, N);
}
//===----------------------------------------------------------------------===//
// SelectionDAG Class
//===----------------------------------------------------------------------===//
/// doNotCSE - Return true if CSE should not be performed for this node.
static bool doNotCSE(SDNode *N) {
if (N->getValueType(0) == MVT::Glue)
return true; // Never CSE anything that produces a flag.
switch (N->getOpcode()) {
default: break;
case ISD::HANDLENODE:
case ISD::EH_LABEL:
return true; // Never CSE these nodes.
}
// Check that remaining values produced are not flags.
for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
if (N->getValueType(i) == MVT::Glue)
return true; // Never CSE anything that produces a flag.
return false;
}
/// RemoveDeadNodes - This method deletes all unreachable nodes in the
/// SelectionDAG.
void SelectionDAG::RemoveDeadNodes() {
// Create a dummy node (which is not added to allnodes), that adds a reference
// to the root node, preventing it from being deleted.
HandleSDNode Dummy(getRoot());
SmallVector<SDNode*, 128> DeadNodes;
// Add all obviously-dead nodes to the DeadNodes worklist.
for (SDNode &Node : allnodes())
if (Node.use_empty())
DeadNodes.push_back(&Node);
RemoveDeadNodes(DeadNodes);
// If the root changed (e.g. it was a dead load), update the root.
setRoot(Dummy.getValue());
}
/// RemoveDeadNodes - This method deletes the unreachable nodes in the
/// given list, and any nodes that become unreachable as a result.
void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
// Process the worklist, deleting the nodes and adding their uses to the
// worklist.
while (!DeadNodes.empty()) {
SDNode *N = DeadNodes.pop_back_val();
// Skip to the next node if we've already managed to delete this node. This
// could happen if replacing a node causes a node previously added to the
// worklist to be deleted.
if (N->getOpcode() == ISD::DELETED_NODE)
continue;
for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
DUL->NodeDeleted(N, nullptr);
// Take the node out of the appropriate CSE map.
RemoveNodeFromCSEMaps(N);
// Next, brutally remove the operand list. This is safe to do, as there are
// no cycles in the graph.
for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
SDUse &Use = *I++;
SDNode *Operand = Use.getNode();
Use.set(SDValue());
// Now that we removed this operand, see if there are no uses of it left.
if (Operand->use_empty())
DeadNodes.push_back(Operand);
}
DeallocateNode(N);
}
}
void SelectionDAG::RemoveDeadNode(SDNode *N){
SmallVector<SDNode*, 16> DeadNodes(1, N);
// Create a dummy node that adds a reference to the root node, preventing
// it from being deleted. (This matters if the root is an operand of the
// dead node.)
HandleSDNode Dummy(getRoot());
RemoveDeadNodes(DeadNodes);
}
void SelectionDAG::DeleteNode(SDNode *N) {
// First take this out of the appropriate CSE map.
RemoveNodeFromCSEMaps(N);
// Finally, remove uses due to operands of this node, remove from the
// AllNodes list, and delete the node.
DeleteNodeNotInCSEMaps(N);
}
void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
assert(N->getIterator() != AllNodes.begin() &&
"Cannot delete the entry node!");
assert(N->use_empty() && "Cannot delete a node that is not dead!");
// Drop all of the operands and decrement used node's use counts.
N->DropOperands();
DeallocateNode(N);
}
void SDDbgInfo::erase(const SDNode *Node) {
DbgValMapType::iterator I = DbgValMap.find(Node);
if (I == DbgValMap.end())
return;
for (auto &Val: I->second)
Val->setIsInvalidated();
DbgValMap.erase(I);
}
void SelectionDAG::DeallocateNode(SDNode *N) {
// If we have operands, deallocate them.
removeOperands(N);
NodeAllocator.Deallocate(AllNodes.remove(N));
// Set the opcode to DELETED_NODE to help catch bugs when node
// memory is reallocated.
// FIXME: There are places in SDag that have grown a dependency on the opcode
// value in the released node.
__asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
N->NodeType = ISD::DELETED_NODE;
// If any of the SDDbgValue nodes refer to this SDNode, invalidate
// them and forget about that node.
DbgInfo->erase(N);
}
#ifndef NDEBUG
/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid.
static void VerifySDNode(SDNode *N) {
switch (N->getOpcode()) {
default:
break;
case ISD::BUILD_PAIR: {
EVT VT = N->getValueType(0);
assert(N->getNumValues() == 1 && "Too many results!");
assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
"Wrong return type!");
assert(N->getNumOperands() == 2 && "Wrong number of operands!");
assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
"Mismatched operand types!");
assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
"Wrong operand type!");
assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
"Wrong return type size");
break;
}
case ISD::BUILD_VECTOR: {
assert(N->getNumValues() == 1 && "Too many results!");
assert(N->getValueType(0).isVector() && "Wrong return type!");
assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
"Wrong number of operands!");
EVT EltVT = N->getValueType(0).getVectorElementType();
for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
assert((I->getValueType() == EltVT ||
(EltVT.isInteger() && I->getValueType().isInteger() &&
EltVT.bitsLE(I->getValueType()))) &&
"Wrong operand type!");
assert(I->getValueType() == N->getOperand(0).getValueType() &&
"Operands must all have the same type");
}
break;
}
}
}
#endif // NDEBUG
/// \brief Insert a newly allocated node into the DAG.
///
/// Handles insertion into the all nodes list and CSE map, as well as
/// verification and other common operations when a new node is allocated.
void SelectionDAG::InsertNode(SDNode *N) {
AllNodes.push_back(N);
#ifndef NDEBUG
N->PersistentId = NextPersistentId++;
VerifySDNode(N);
#endif
}
/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
/// corresponds to it. This is useful when we're about to delete or repurpose
/// the node. We don't want future requests for structurally identical nodes
/// to return N anymore.
bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
bool Erased = false;
switch (N->getOpcode()) {
case ISD::HANDLENODE: return false; // noop.
case ISD::CONDCODE:
assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
"Cond code doesn't exist!");
Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
break;
case ISD::ExternalSymbol:
Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
break;
case ISD::TargetExternalSymbol: {
ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
Erased = TargetExternalSymbols.erase(
std::pair<std::string,unsigned char>(ESN->getSymbol(),
ESN->getTargetFlags()));
break;
}
case ISD::MCSymbol: {
auto *MCSN = cast<MCSymbolSDNode>(N);
Erased = MCSymbols.erase(MCSN->getMCSymbol());
break;
}
case ISD::VALUETYPE: {
EVT VT = cast<VTSDNode>(N)->getVT();
if (VT.isExtended()) {
Erased = ExtendedValueTypeNodes.erase(VT);
} else {
Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
}
break;
}
default:
// Remove it from the CSE Map.
assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
Erased = CSEMap.RemoveNode(N);
break;
}
#ifndef NDEBUG
// Verify that the node was actually in one of the CSE maps, unless it has a
// flag result (which cannot be CSE'd) or is one of the special cases that are
// not subject to CSE.
if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
!N->isMachineOpcode() && !doNotCSE(N)) {
N->dump(this);
dbgs() << "\n";
llvm_unreachable("Node is not in map!");
}
#endif
return Erased;
}
/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
/// maps and modified in place. Add it back to the CSE maps, unless an identical
/// node already exists, in which case transfer all its users to the existing
/// node. This transfer can potentially trigger recursive merging.
void
SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
// For node types that aren't CSE'd, just act as if no identical node
// already exists.
if (!doNotCSE(N)) {
SDNode *Existing = CSEMap.GetOrInsertNode(N);
if (Existing != N) {
// If there was already an existing matching node, use ReplaceAllUsesWith
// to replace the dead one with the existing one. This can cause
// recursive merging of other unrelated nodes down the line.
ReplaceAllUsesWith(N, Existing);
// N is now dead. Inform the listeners and delete it.
for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
DUL->NodeDeleted(N, Existing);
DeleteNodeNotInCSEMaps(N);
return;
}
}
// If the node doesn't already exist, we updated it. Inform listeners.
for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
DUL->NodeUpdated(N);
}
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null, otherwise return a pointer to the slot it would take. If a
/// node already exists with these operands, the slot will be non-null.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
void *&InsertPos) {
if (doNotCSE(N))
return nullptr;
SDValue Ops[] = { Op };
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
Node->intersectFlagsWith(N->getFlags());
return Node;
}
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null, otherwise return a pointer to the slot it would take. If a
/// node already exists with these operands, the slot will be non-null.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
SDValue Op1, SDValue Op2,
void *&InsertPos) {
if (doNotCSE(N))
return nullptr;
SDValue Ops[] = { Op1, Op2 };
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
Node->intersectFlagsWith(N->getFlags());
return Node;
}
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null; otherwise return a pointer to the slot it would take. If a
/// node already exists with these operands, the slot will be non-null.
SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
void *&InsertPos) {
if (doNotCSE(N))
return nullptr;
FoldingSetNodeID ID;
AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
Node->intersectFlagsWith(N->getFlags());
return Node;
}
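// getEVTAlignment - Return the ABI alignment of the in-memory type that
// corresponds to VT; MVT::iPTR is treated as an i8 pointer in address space 0.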
unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
Type *Ty = VT == MVT::iPTR ?
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
VT.getTypeForEVT(*getContext());
return getDataLayout().getABITypeAlignment(Ty);
}
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
: TM(tm), OptLevel(OL),
EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
Root(getEntryNode()) {
InsertNode(&EntryNode);
DbgInfo = new SDDbgInfo();
}
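// init - Late initialization once the MachineFunction is known: remember the
// function and remark emitter, and cache the subtarget's TargetLowering,
// SelectionDAGInfo and the owning LLVMContext.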
void SelectionDAG::init(MachineFunction &NewMF,
OptimizationRemarkEmitter &NewORE) {
MF = &NewMF;
ORE = &NewORE;
TLI = getSubtarget().getTargetLowering();
TSI = getSubtarget().getSelectionDAGInfo();
Context = &MF->getFunction()->getContext();
}
SelectionDAG::~SelectionDAG() {
assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
allnodes_clear();
OperandRecycler.clear(OperandAllocator);
delete DbgInfo;
}
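// allnodes_clear - Deallocate every node in the DAG. The statically allocated
// EntryNode is only unlinked from the list, not deallocated.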
void SelectionDAG::allnodes_clear() {
assert(&*AllNodes.begin() == &EntryNode);
AllNodes.remove(AllNodes.begin());
while (!AllNodes.empty())
DeallocateNode(&AllNodes.front());
#ifndef NDEBUG
NextPersistentId = 0;
#endif
}
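// FindNodeOrInsertPos - Look up a node in the CSE map by its FoldingSetNodeID.
// Constant and ConstantFP queries must use the overload that also takes a
// debug location (see below).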
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
void *&InsertPos) {
SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
if (N) {
switch (N->getOpcode()) {
default: break;
case ISD::Constant:
case ISD::ConstantFP:
llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
"debug location. Use another overload.");
}
}
return N;
}
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
const SDLoc &DL, void *&InsertPos) {
SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
if (N) {
switch (N->getOpcode()) {
case ISD::Constant:
case ISD::ConstantFP:
// Erase debug location from the node if the node is used at several
// different places. Do not propagate one location to all uses as it
// will cause a worse single stepping debugging experience.
if (N->getDebugLoc() != DL.getDebugLoc())
N->setDebugLoc(DebugLoc());
break;
default:
// When the node's point of use is located earlier in the instruction
// sequence than its prior point of use, update its debug info to the
// earlier location.
if (DL.getIROrder() && DL.getIROrder() < N->getIROrder())
N->setDebugLoc(DL.getDebugLoc());
break;
}
}
return N;
}
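// clear - Reset the DAG to its just-constructed state: drop all nodes, CSE
// tables and debug info, then re-insert the entry token and make it the root.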
void SelectionDAG::clear() {
allnodes_clear();
OperandRecycler.clear(OperandAllocator);
OperandAllocator.Reset();
CSEMap.clear();
ExtendedValueTypeNodes.clear();
ExternalSymbols.clear();
TargetExternalSymbols.clear();
MCSymbols.clear();
std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
static_cast<CondCodeSDNode*>(nullptr));
std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
static_cast<SDNode*>(nullptr));
EntryNode.UseList = nullptr;
InsertNode(&EntryNode);
Root = getEntryNode();
DbgInfo->clear();
}
SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType())
? getNode(ISD::FP_EXTEND, DL, VT, Op)
: getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
}
SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::ANY_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
getNode(ISD::TRUNCATE, DL, VT, Op);
}
SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
EVT OpVT) {
if (VT.bitsLE(Op.getValueType()))
return getNode(ISD::TRUNCATE, SL, VT, Op);
TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT);
return getNode(TLI->getExtendForContent(BType), SL, VT, Op);
}
SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
assert(!VT.isVector() &&
"getZeroExtendInReg should use the vector element type instead of "
"the vector type!");
if (Op.getValueType() == VT) return Op;
unsigned BitWidth = Op.getScalarValueSizeInBits();
APInt Imm = APInt::getLowBitsSet(BitWidth,
VT.getSizeInBits());
return getNode(ISD::AND, DL, Op.getValueType(), Op,
getConstant(Imm, DL, Op.getValueType()));
}
SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
"extend in-register.");
assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
"The destination vector type must have fewer lanes than the input.");
return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
}
SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
"extend in-register.");
assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
"The destination vector type must have fewer lanes than the input.");
return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
}
SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
EVT VT) {
assert(VT.isVector() && "This DAG node is restricted to vector types.");
assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
"The sizes of the input and result must match in order to perform the "
"extend in-register.");
assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
"The destination vector type must have fewer lanes than the input.");
return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
}
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue NegOne =
getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT);
return getNode(ISD::XOR, DL, VT, Val, NegOne);
}
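// getLogicalNOT - Invert a boolean value by XOR'ing it with the constant the
// target considers 'true' for VT (1 or all-ones, per getBooleanContents).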
SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue TrueValue;
switch (TLI->getBooleanContents(VT)) {
case TargetLowering::ZeroOrOneBooleanContent:
case TargetLowering::UndefinedBooleanContent:
TrueValue = getConstant(1, DL, VT);
break;
case TargetLowering::ZeroOrNegativeOneBooleanContent:
TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL,
VT);
break;
}
return getNode(ISD::XOR, DL, VT, Val, TrueValue);
}
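// getConstant - Materialize an integer constant of type VT. The uint64_t and
// APInt forms forward to the ConstantInt form; vector types are produced as a
// splat BUILD_VECTOR of the scalar constant.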
SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
bool isT, bool isO) {
EVT EltVT = VT.getScalarType();
assert((EltVT.getSizeInBits() >= 64 ||
(uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
"getConstant with a uint64_t value that doesn't fit in the type!");
return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
}
SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
bool isT, bool isO) {
return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
}
SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
EVT VT, bool isT, bool isO) {
assert(VT.isInteger() && "Cannot create FP integer constant!");
EVT EltVT = VT.getScalarType();
const ConstantInt *Elt = &Val;
// In some cases the vector type is legal but the element type is illegal and
// needs to be promoted, for example v8i8 on ARM. In this case, promote the
// inserted value (the type does not need to match the vector element type).
// Any extra bits introduced will be truncated away.
if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypePromoteInteger) {
EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
Elt = ConstantInt::get(*getContext(), NewVal);
}
// In other cases the element type is illegal and needs to be expanded, for
// example v2i64 on MIPS32. In this case, find the nearest legal type, split
// the value into n parts and use a vector type with n-times the elements.
// Then bitcast to the type requested.
// Legalizing constants too early makes the DAGCombiner's job harder so we
// only legalize if the DAG tells us we must produce legal types.
else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypeExpandInteger) {
const APInt &NewVal = Elt->getValue();
EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
// Check the temporary vector is the correct size. If this fails then
// getTypeToTransformTo() probably returned a type whose size (in bits)
// isn't a power-of-2 factor of the requested type size.
assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
SmallVector<SDValue, 2> EltParts;
for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
.zextOrTrunc(ViaEltSizeInBits), DL,
ViaEltVT, isT, isO));
}
// EltParts is currently in little endian order. If we actually want
// big-endian order then reverse it now.
if (getDataLayout().isBigEndian())
std::reverse(EltParts.begin(), EltParts.end());
// The elements must be reversed when the element order is different
// from the endianness of the elements (because the BITCAST is itself a
// vector shuffle in this situation). However, we do not need any code to
// perform this reversal because getConstant() is producing a vector
// splat.
// This situation occurs in MIPS MSA.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
return getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
}
assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
"APInt size does not match type size!");
unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
ID.AddPointer(Elt);
ID.AddBoolean(isO);
void *IP = nullptr;
SDNode *N = nullptr;
if ((N = FindNodeOrInsertPos(ID, DL, IP)))
if (!VT.isVector())
return SDValue(N, 0);
if (!N) {
N = newSDNode<ConstantSDNode>(isT, isO, Elt, DL.getDebugLoc(), EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
}
SDValue Result(N, 0);
if (VT.isVector())
Result = getSplatBuildVector(VT, DL, Result);
return Result;
}
SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
bool isTarget) {
return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
}
SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
bool isTarget) {
return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
}
SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
EVT VT, bool isTarget) {
assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
EVT EltVT = VT.getScalarType();
// Do the map lookup using the actual bit pattern for the floating point
// value, so that we don't have problems with 0.0 comparing equal to -0.0, and
// we don't have issues with SNaNs.
unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
ID.AddPointer(&V);
void *IP = nullptr;
SDNode *N = nullptr;
if ((N = FindNodeOrInsertPos(ID, DL, IP)))
if (!VT.isVector())
return SDValue(N, 0);
if (!N) {
N = newSDNode<ConstantFPSDNode>(isTarget, &V, DL.getDebugLoc(), EltVT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
}
SDValue Result(N, 0);
if (VT.isVector())
Result = getSplatBuildVector(VT, DL, Result);
return Result;
}
SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
bool isTarget) {
EVT EltVT = VT.getScalarType();
if (EltVT == MVT::f32)
return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
else if (EltVT == MVT::f64)
return getConstantFP(APFloat(Val), DL, VT, isTarget);
else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
EltVT == MVT::f16) {
bool Ignored;
APFloat APF = APFloat(Val);
APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
&Ignored);
return getConstantFP(APF, DL, VT, isTarget);
} else
llvm_unreachable("Unsupported type in getConstantFP");
}
SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
EVT VT, int64_t Offset, bool isTargetGA,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTargetGA) &&
"Cannot set target flags on target-independent globals");
// Truncate (with sign-extension) the offset value to the pointer size.
unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
if (BitWidth < 64)
Offset = SignExtend64(Offset, BitWidth);
unsigned Opc;
if (GV->isThreadLocal())
Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
else
Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddPointer(GV);
ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
auto *N = newSDNode<GlobalAddressSDNode>(
Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddInteger(FI);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent jump tables");
unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddInteger(JTI);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
Alignment = MF->getFunction()->optForSize()
? getDataLayout().getABITypeAlignment(C->getType())
: getDataLayout().getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddInteger(Alignment);
ID.AddInteger(Offset);
ID.AddPointer(C);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddInteger(Alignment);
ID.AddInteger(Offset);
C->addSelectionDAGCSEId(ID);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
unsigned char TargetFlags) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None);
ID.AddInteger(Index);
ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None);
ID.AddPointer(MBB);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<BasicBlockSDNode>(MBB);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getValueType(EVT VT) {
if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
ValueTypeNodes.size())
ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);
SDNode *&N = VT.isExtended() ?
ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
if (N) return SDValue(N, 0);
N = newSDNode<VTSDNode>(VT);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
SDNode *&N = ExternalSymbols[Sym];
if (N) return SDValue(N, 0);
N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
SDNode *&N = MCSymbols[Sym];
if (N)
return SDValue(N, 0);
N = newSDNode<MCSymbolSDNode>(Sym, VT);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
unsigned char TargetFlags) {
SDNode *&N =
TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
TargetFlags)];
if (N) return SDValue(N, 0);
N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
if ((unsigned)Cond >= CondCodeNodes.size())
CondCodeNodes.resize(Cond+1);
if (!CondCodeNodes[Cond]) {
auto *N = newSDNode<CondCodeSDNode>(Cond);
CondCodeNodes[Cond] = N;
InsertNode(N);
}
return SDValue(CondCodeNodes[Cond], 0);
}
/// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
/// point at N1 to point at N2 and indices that point at N2 to point at N1.
static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
std::swap(N1, N2);
ShuffleVectorSDNode::commuteMask(M);
}
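// getVectorShuffle - Return a VECTOR_SHUFFLE of N1 and N2 with the given mask,
// canonicalizing undef operands, splats and degenerate masks along the way.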
SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
SDValue N2, ArrayRef<int> Mask) {
assert(VT.getVectorNumElements() == Mask.size() &&
"Must have the same number of vector elements as mask elements!");
assert(VT == N1.getValueType() && VT == N2.getValueType() &&
"Invalid VECTOR_SHUFFLE");
// Canonicalize shuffle undef, undef -> undef
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
// Validate that all indices in Mask are within the range of the elements
// input to the shuffle.
int NElts = Mask.size();
assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
"Index out of range");
// Copy the mask so we can do any needed cleanup.
SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());
// Canonicalize shuffle v, v -> v, undef
if (N1 == N2) {
N2 = getUNDEF(VT);
for (int i = 0; i != NElts; ++i)
if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
}
// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
if (N1.isUndef())
commuteShuffle(N1, N2, MaskVec);
// If shuffling a splat, try to blend the splat instead. We do this here so
// that even when this arises during lowering we don't have to re-handle it.
auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
BitVector UndefElements;
SDValue Splat = BV->getSplatValue(&UndefElements);
if (!Splat)
return;
for (int i = 0; i < NElts; ++i) {
if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
continue;
// If this input comes from undef, mark it as such.
if (UndefElements[MaskVec[i] - Offset]) {
MaskVec[i] = -1;
continue;
}
// If we can blend a non-undef lane, use that instead.
if (!UndefElements[i])
MaskVec[i] = i + Offset;
}
};
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
BlendSplat(N1BV, 0);
if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
BlendSplat(N2BV, NElts);
// Canonicalize a mask whose indices all reference lhs -> shuffle lhs, undef
// Canonicalize a mask whose indices all reference rhs -> shuffle rhs, undef
bool AllLHS = true, AllRHS = true;
bool N2Undef = N2.isUndef();
for (int i = 0; i != NElts; ++i) {
if (MaskVec[i] >= NElts) {
if (N2Undef)
MaskVec[i] = -1;
else
AllLHS = false;
} else if (MaskVec[i] >= 0) {
AllRHS = false;
}
}
if (AllLHS && AllRHS)
return getUNDEF(VT);
if (AllLHS && !N2Undef)
N2 = getUNDEF(VT);
if (AllRHS) {
N1 = getUNDEF(VT);
commuteShuffle(N1, N2, MaskVec);
}
// Reset our undef status after accounting for the mask.
N2Undef = N2.isUndef();
// Re-check whether both sides ended up undef.
if (N1.isUndef() && N2Undef)
return getUNDEF(VT);
// If this is an identity shuffle, return the first operand.
bool Identity = true, AllSame = true;
for (int i = 0; i != NElts; ++i) {
if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
if (MaskVec[i] != MaskVec[0]) AllSame = false;
}
if (Identity && NElts)
return N1;
// Shuffling a constant splat doesn't change the result.
if (N2Undef) {
SDValue V = N1;
// Look through any bitcasts. We check that these don't change the number
// (and size) of elements and only change their types.
while (V.getOpcode() == ISD::BITCAST)
V = V->getOperand(0);
// A splat should always show up as a build vector node.
if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
BitVector UndefElements;
SDValue Splat = BV->getSplatValue(&UndefElements);
// If this is a splat of an undef, shuffling it is also undef.
if (Splat && Splat.isUndef())
return getUNDEF(VT);
bool SameNumElts =
V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
// We can only skip the shuffle if there is a splatted value and no undef
// lanes are rearranged by the shuffle.
if (Splat && UndefElements.none()) {
// Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
// number of elements matches or the splatted value is a zero constant.
if (SameNumElts)
return N1;
if (auto *C = dyn_cast<ConstantSDNode>(Splat))
if (C->isNullValue())
return N1;
}
// If the shuffle itself creates a splat, build the vector directly.
if (AllSame && SameNumElts) {
EVT BuildVT = BV->getValueType(0);
const SDValue &Splatted = BV->getOperand(MaskVec[0]);
SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
// We may have jumped through bitcasts, so the type of the
// BUILD_VECTOR may not match the type of the shuffle.
if (BuildVT != VT)
NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
return NewBV;
}
}
}
FoldingSetNodeID ID;
SDValue Ops[2] = { N1, N2 };
AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
for (int i = 0; i != NElts; ++i)
ID.AddInteger(MaskVec[i]);
void* IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
// Allocate the mask array for the node out of the BumpPtrAllocator, since
// SDNode doesn't have access to it. This memory will be "leaked" when
// the node is deallocated, but recovered when the NodeAllocator is released.
int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc);
auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
dl.getDebugLoc(), MaskAlloc);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
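// getCommutedVectorShuffle - Return the same shuffle with its two operands
// swapped and the mask commuted to match.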
SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
MVT VT = SV.getSimpleValueType(0);
SmallVector<int, 8> MaskVec(SV.getMask().begin(), SV.getMask().end());
ShuffleVectorSDNode::commuteMask(MaskVec);
SDValue Op0 = SV.getOperand(0);
SDValue Op1 = SV.getOperand(1);
return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec);
}
SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::Register, getVTList(VT), None);
ID.AddInteger(RegNo);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None);
ID.AddPointer(RegMask);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<RegisterMaskSDNode>(RegMask);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
MCSymbol *Label) {
FoldingSetNodeID ID;
SDValue Ops[] = { Root };
AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops);
ID.AddPointer(Label);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<EHLabelSDNode>(dl.getIROrder(), dl.getDebugLoc(), Label);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
int64_t Offset,
bool isTarget,
unsigned char TargetFlags) {
unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
ID.AddPointer(BA);
ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getSrcValue(const Value *V) {
assert((!V || V->getType()->isPointerTy()) &&
"SrcValue is not a pointer?");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
ID.AddPointer(V);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<SrcValueSDNode>(V);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMDNode(const MDNode *MD) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
ID.AddPointer(MD);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
auto *N = newSDNode<MDNodeSDNode>(MD);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
if (VT == V.getValueType())
return V;
return getNode(ISD::BITCAST, SDLoc(V), VT, V);
}
SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
unsigned SrcAS, unsigned DestAS) {
SDValue Ops[] = {Ptr};
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops);
ID.AddInteger(SrcAS);
ID.AddInteger(DestAS);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
auto *N = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VT, SrcAS, DestAS);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
/// getShiftAmountOperand - Return the specified value cast to
/// the target's desired shift amount type.
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
EVT OpTy = Op.getValueType();
EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
if (OpTy == ShTy || OpTy.isVector()) return Op;
return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
}
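// expandVAArg - Expand a VAARG node into explicit operations on the va_list
// pointer: load it, round it up to the required alignment, advance it past the
// argument, store it back, then load the argument itself.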
SDValue SelectionDAG::expandVAArg(SDNode *Node) {
SDLoc dl(Node);
const TargetLowering &TLI = getTargetLoweringInfo();
const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
EVT VT = Node->getValueType(0);
SDValue Tmp1 = Node->getOperand(0);
SDValue Tmp2 = Node->getOperand(1);
unsigned Align = Node->getConstantOperandVal(3);
SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1,
Tmp2, MachinePointerInfo(V));
SDValue VAList = VAListLoad;
if (Align > TLI.getMinStackArgumentAlignment()) {
assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
getConstant(Align - 1, dl, VAList.getValueType()));
VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList,
getConstant(-(int64_t)Align, dl, VAList.getValueType()));
}
// Increment the pointer, VAList, to the next vaarg
Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
getConstant(getDataLayout().getTypeAllocSize(
VT.getTypeForEVT(*getContext())),
dl, VAList.getValueType()));
// Store the incremented VAList to the legalized pointer
Tmp1 =
getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo());
}
SDValue SelectionDAG::expandVACopy(SDNode *Node) {
SDLoc dl(Node);
const TargetLowering &TLI = getTargetLoweringInfo();
// This defaults to loading a pointer from the input and storing it to the
// output, returning the chain.
const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
SDValue Tmp1 =
getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
Node->getOperand(2), MachinePointerInfo(VS));
return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
MachinePointerInfo(VD));
}
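// CreateStackTemporary - Create a stack slot large and aligned enough to hold
// VT (respecting minAlign) and return a FrameIndex node referring to it.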
SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
unsigned ByteSize = VT.getStoreSize();
Type *Ty = VT.getTypeForEVT(*getContext());
unsigned StackAlign =
std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
Type *Ty1 = VT1.getTypeForEVT(*getContext());
Type *Ty2 = VT2.getTypeForEVT(*getContext());
const DataLayout &DL = getDataLayout();
unsigned Align =
std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);
return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
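// FoldSetCC - Try to fold a SETCC with the given operands and condition code
// to a constant, or canonicalize a constant LHS over to the RHS; returns an
// empty SDValue when nothing can be done.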
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
ISD::CondCode Cond, const SDLoc &dl) {
// These setcc operations always fold.
switch (Cond) {
default: break;
case ISD::SETFALSE:
case ISD::SETFALSE2: return getConstant(0, dl, VT);
case ISD::SETTRUE:
case ISD::SETTRUE2: {
TargetLowering::BooleanContent Cnt =
TLI->getBooleanContents(N1->getValueType(0));
return getConstant(
Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
VT);
}
case ISD::SETOEQ:
case ISD::SETOGT:
case ISD::SETOGE:
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETONE:
case ISD::SETO:
case ISD::SETUO:
case ISD::SETUEQ:
case ISD::SETUNE:
assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
break;
}
if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
const APInt &C2 = N2C->getAPIntValue();
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
const APInt &C1 = N1C->getAPIntValue();
switch (Cond) {
default: llvm_unreachable("Unknown integer setcc!");
case ISD::SETEQ: return getConstant(C1 == C2, dl, VT);
case ISD::SETNE: return getConstant(C1 != C2, dl, VT);
case ISD::SETULT: return getConstant(C1.ult(C2), dl, VT);
case ISD::SETUGT: return getConstant(C1.ugt(C2), dl, VT);
case ISD::SETULE: return getConstant(C1.ule(C2), dl, VT);
case ISD::SETUGE: return getConstant(C1.uge(C2), dl, VT);
case ISD::SETLT: return getConstant(C1.slt(C2), dl, VT);
case ISD::SETGT: return getConstant(C1.sgt(C2), dl, VT);
case ISD::SETLE: return getConstant(C1.sle(C2), dl, VT);
case ISD::SETGE: return getConstant(C1.sge(C2), dl, VT);
}
}
}
if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1)) {
if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2)) {
APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
switch (Cond) {
default: break;
case ISD::SETEQ: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, dl, VT);
case ISD::SETNE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpLessThan, dl, VT);
case ISD::SETLT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, dl, VT);
case ISD::SETGT: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, dl, VT);
case ISD::SETLE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
R==APFloat::cmpEqual, dl, VT);
case ISD::SETGE: if (R==APFloat::cmpUnordered)
return getUNDEF(VT);
LLVM_FALLTHROUGH;
case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpEqual, dl, VT);
case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, dl, VT);
case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, dl, VT);
case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
R==APFloat::cmpEqual, dl, VT);
case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, dl, VT);
case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
R==APFloat::cmpLessThan, dl, VT);
case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
R==APFloat::cmpUnordered, dl, VT);
case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, dl, VT);
case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, dl, VT);
}
} else {
// Ensure that the constant occurs on the RHS.
ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
MVT CompVT = N1.getValueType().getSimpleVT();
if (!TLI->isCondCodeLegal(SwappedCond, CompVT))
return SDValue();
return getSetCC(dl, VT, N2, N1, SwappedCond);
}
}
// Could not fold it.
return SDValue();
}
/// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We
/// use this predicate to simplify operations downstream.
bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
unsigned BitWidth = Op.getScalarValueSizeInBits();
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Mask is known to be zero
/// for bits that V cannot have.
bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
unsigned Depth) const {
KnownBits Known;
computeKnownBits(Op, Known, Depth);
return Mask.isSubsetOf(Known.Zero);
}
/// If a SHL/SRA/SRL node has a constant or splat constant shift amount that
/// is less than the element bit-width of the shift node, return it.
static const APInt *getValidShiftAmountConstant(SDValue V) {
if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1))) {
// Shifting more than the bitwidth is not valid.
const APInt &ShAmt = SA->getAPIntValue();
if (ShAmt.ult(V.getScalarValueSizeInBits()))
return &ShAmt;
}
return nullptr;
}
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. For vectors, the known bits are those that are shared by
/// every vector element.
void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
unsigned Depth) const {
EVT VT = Op.getValueType();
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
computeKnownBits(Op, Known, DemandedElts, Depth);
}
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. The DemandedElts argument allows us to only collect the known
/// bits that are shared by the requested vector elements.
void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
const APInt &DemandedElts,
unsigned Depth) const {
unsigned BitWidth = Op.getScalarValueSizeInBits();
Known = KnownBits(BitWidth); // Don't know anything.
if (Depth == 6)
return; // Limit search depth.
KnownBits Known2;
unsigned NumElts = DemandedElts.getBitWidth();
if (!DemandedElts)
return; // No demanded elts, better to assume we don't know anything.
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::Constant:
// We know all of the bits for a constant!
Known.One = cast<ConstantSDNode>(Op)->getAPIntValue();
Known.Zero = ~Known.One;
break;
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every demanded vector element.
assert(NumElts == Op.getValueType().getVectorNumElements() &&
"Unexpected vector size");
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
if (!DemandedElts[i])
continue;
SDValue SrcOp = Op.getOperand(i);
computeKnownBits(SrcOp, Known2, Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, so we must handle this.
if (SrcOp.getValueSizeInBits() != BitWidth) {
assert(SrcOp.getValueSizeInBits() > BitWidth &&
"Expected BUILD_VECTOR implicit truncation");
Known2 = Known2.trunc(BitWidth);
}
// Known bits are the values that are shared by every demanded element.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
}
break;
case ISD::VECTOR_SHUFFLE: {
// Collect the known bits that are shared by every vector element referenced
// by the shuffle.
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
Known.Zero.setAllBits(); Known.One.setAllBits();
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
int M = SVN->getMaskElt(i);
if (M < 0) {
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
Known.resetAll();
DemandedLHS.clearAllBits();
DemandedRHS.clearAllBits();
break;
}
if ((unsigned)M < NumElts)
DemandedLHS.setBit((unsigned)M % NumElts);
else
DemandedRHS.setBit((unsigned)M % NumElts);
}
// Known bits are the values that are shared by every demanded element.
if (!!DemandedLHS) {
SDValue LHS = Op.getOperand(0);
computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
if (!!DemandedRHS) {
SDValue RHS = Op.getOperand(1);
computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case ISD::CONCAT_VECTORS: {
// Split DemandedElts and test each of the demanded subvectors.
Known.Zero.setAllBits(); Known.One.setAllBits();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
for (unsigned i = 0; i != NumSubVectors; ++i) {
APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
DemandedSub = DemandedSub.trunc(NumSubVectorElts);
if (!!DemandedSub) {
SDValue Sub = Op.getOperand(i);
computeKnownBits(Sub, Known2, DemandedSub, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
}
break;
}
case ISD::EXTRACT_SUBVECTOR: {
// If we know the element index, just demand that subvector's elements;
// otherwise demand them all.
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
computeKnownBits(Src, Known, DemandedSrc, Depth + 1);
} else {
computeKnownBits(Src, Known, Depth + 1);
}
break;
}
case ISD::BITCAST: {
SDValue N0 = Op.getOperand(0);
unsigned SubBitWidth = N0.getScalarValueSizeInBits();
// Ignore bitcasts from floating point.
if (!N0.getValueType().isInteger())
break;
// Fast handling of 'identity' bitcasts.
if (BitWidth == SubBitWidth) {
computeKnownBits(N0, Known, DemandedElts, Depth + 1);
break;
}
// Support big-endian targets when it becomes useful.
bool IsLE = getDataLayout().isLittleEndian();
if (!IsLE)
break;
// Bitcast 'small element' vector to 'large element' scalar/vector.
if ((BitWidth % SubBitWidth) == 0) {
assert(N0.getValueType().isVector() && "Expected bitcast from vector");
// Collect known bits for the (larger) output by collecting the known
// bits from each set of sub elements and shift these into place.
// We need to separately call computeKnownBits for each set of
// sub elements as the knownbits for each is likely to be different.
unsigned SubScale = BitWidth / SubBitWidth;
APInt SubDemandedElts(NumElts * SubScale, 0);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SubDemandedElts.setBit(i * SubScale);
for (unsigned i = 0; i != SubScale; ++i) {
computeKnownBits(N0, Known2, SubDemandedElts.shl(i),
Depth + 1);
Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i);
Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i);
}
}
// Bitcast 'large element' scalar/vector to 'small element' vector.
if ((SubBitWidth % BitWidth) == 0) {
assert(Op.getValueType().isVector() && "Expected bitcast to vector");
// Collect known bits for the (smaller) output by collecting the known
// bits from the overlapping larger input elements and extracting the
// sub sections we actually care about.
unsigned SubScale = SubBitWidth / BitWidth;
APInt SubDemandedElts(NumElts / SubScale, 0);
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i])
SubDemandedElts.setBit(i / SubScale);
computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1);
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % SubScale) * BitWidth;
Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
}
}
break;
}
case ISD::AND:
// If either the LHS or the RHS are Zero, the result is zero.
computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-1 bits are only known if set in both the LHS & RHS.
Known.One &= Known2.One;
// Output known-0 bits are known to be clear if zero in either the LHS or RHS.
Known.Zero |= Known2.Zero;
break;
case ISD::OR:
computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
Known.Zero &= Known2.Zero;
// Output known-1 bits are known to be set if set in either the LHS or RHS.
Known.One |= Known2.One;
break;
case ISD::XOR: {
computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 bits are known to be set if set in only one of the LHS or RHS.
Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
Known.Zero = KnownZeroOut;
break;
}
case ISD::MUL: {
computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If low bits are zero in either operand, output low known-0 bits.
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
unsigned TrailZ = Known.countMinTrailingZeros() +
Known2.countMinTrailingZeros();
unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
Known.resetAll();
Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
break;
}
case ISD::UDIV: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
unsigned LeadZ = Known2.countMinLeadingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
if (RHSMaxLeadingZeros != BitWidth)
LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
Known.Zero.setHighBits(LeadZ);
break;
}
case ISD::SELECT:
computeKnownBits(Op.getOperand(2), Known, Depth+1);
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
computeKnownBits(Op.getOperand(1), Known2, Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SELECT_CC:
computeKnownBits(Op.getOperand(3), Known, Depth+1);
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
computeKnownBits(Op.getOperand(2), Known2, Depth+1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
break;
case ISD::SMULO:
case ISD::UMULO:
if (Op.getResNo() != 1)
break;
// The boolean result conforms to getBooleanContents.
// If we know the result of a setcc has the top bits zero, use this info.
// We know that we have an integer-based boolean since these operations
// are only available for integer types.
if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
case ISD::SHL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known.Zero <<= *ShAmt;
Known.One <<= *ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt->getZExtValue());
}
break;
case ISD::SRL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known.Zero.lshrInPlace(*ShAmt);
Known.One.lshrInPlace(*ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt->getZExtValue());
}
break;
case ISD::SRA:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known.Zero.lshrInPlace(*ShAmt);
Known.One.lshrInPlace(*ShAmt);
// If we know the value of the sign bit, then we know it is copied across
// the high bits by the shift amount.
APInt SignMask = APInt::getSignMask(BitWidth);
SignMask.lshrInPlace(*ShAmt); // Adjust to where it is now in the mask.
if (Known.Zero.intersects(SignMask)) {
Known.Zero.setHighBits(ShAmt->getZExtValue()); // New bits are known zero.
} else if (Known.One.intersects(SignMask)) {
Known.One.setHighBits(ShAmt->getZExtValue()); // New bits are known one.
}
}
break;
case ISD::SIGN_EXTEND_INREG: {
EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
unsigned EBits = EVT.getScalarSizeInBits();
// Sign extension. Compute the demanded bits in the result that are not
// present in the input.
APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
APInt InSignMask = APInt::getSignMask(EBits);
APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
// If the sign extended bits are demanded, we know that the sign
// bit is demanded.
InSignMask = InSignMask.zext(BitWidth);
if (NewBits.getBoolValue())
InputDemandedBits |= InSignMask;
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known.One &= InputDemandedBits;
Known.Zero &= InputDemandedBits;
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear
Known.Zero |= NewBits;
Known.One &= ~NewBits;
} else if (Known.One.intersects(InSignMask)) { // Input sign bit known set
Known.One |= NewBits;
Known.Zero &= ~NewBits;
} else { // Input sign bit unknown
Known.Zero &= ~NewBits;
Known.One &= ~NewBits;
}
break;
}
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
unsigned PossibleTZ = Known2.countMaxTrailingZeros();
unsigned LowBits = Log2_32(PossibleTZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we have a known 1, its position is our upper bound.
unsigned PossibleLZ = Known2.countMaxLeadingZeros();
unsigned LowBits = Log2_32(PossibleLZ) + 1;
Known.Zero.setBitsFrom(LowBits);
break;
}
case ISD::CTPOP: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If we know some of the bits are zero, they can't be one.
unsigned PossibleOnes = Known2.countMaxPopulation();
Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
case ISD::LOAD: {
LoadSDNode *LD = cast<LoadSDNode>(Op);
// If this is a ZEXTLoad and we are looking at the loaded value.
if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
EVT VT = LD->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
Known.Zero.setBitsFrom(MemBits);
} else if (const MDNode *Ranges = LD->getRanges()) {
if (LD->getExtensionType() == ISD::NON_EXTLOAD)
computeKnownBitsFromRangeMetadata(*Ranges, Known);
}
break;
}
case ISD::ZERO_EXTEND_VECTOR_INREG: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
Known = Known.trunc(InBits);
computeKnownBits(Op.getOperand(0), Known,
DemandedElts.zext(InVT.getVectorNumElements()),
Depth + 1);
Known = Known.zext(BitWidth);
Known.Zero.setBitsFrom(InBits);
break;
}
case ISD::ZERO_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
Known = Known.trunc(InBits);
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known = Known.zext(BitWidth);
Known.Zero.setBitsFrom(InBits);
break;
}
// TODO ISD::SIGN_EXTEND_VECTOR_INREG
case ISD::SIGN_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
Known = Known.trunc(InBits);
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
// If the sign bit is known to be zero or one, then sext will extend
// it to the top bits, else it will just zext.
Known = Known.sext(BitWidth);
break;
}
case ISD::ANY_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
Known = Known.trunc(InBits);
computeKnownBits(Op.getOperand(0), Known, Depth+1);
Known = Known.zext(BitWidth);
break;
}
case ISD::TRUNCATE: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
Known = Known.zext(InBits);
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
Known = Known.trunc(BitWidth);
break;
}
case ISD::AssertZext: {
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
computeKnownBits(Op.getOperand(0), Known, Depth+1);
Known.Zero |= (~InMask);
Known.One &= (~Known.Zero);
break;
}
case ISD::FGETSIGN:
// All bits are zero except the low bit.
Known.Zero.setBitsFrom(1);
break;
case ISD::USUBO:
case ISD::SSUBO:
if (Op.getResNo() == 1) {
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
}
LLVM_FALLTHROUGH;
case ISD::SUB:
case ISD::SUBC: {
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
// We know that the top bits of C-X are clear if X contains fewer bits
// than C (i.e. no wrap-around can happen). For example, 20-X is
// positive if we can prove that X is >= 0 and < 16.
if (CLHS->getAPIntValue().isNonNegative()) {
unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
// NLZ can't be BitWidth with no sign bit
APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
// If all of the MaskV bits are known to be zero, then we know the
// output top bits are zero, because we now know that the output is
// from [0-C].
if ((Known2.Zero & MaskV) == MaskV) {
unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
// Top bits known zero.
Known.Zero.setHighBits(NLZ2);
}
}
}
// If low bits are known to be zero in both operands, then we know they are
// going to be 0 in the result. Both addition and complement operations
// preserve the low zero bits.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
unsigned KnownZeroLow = Known2.countMinTrailingZeros();
if (KnownZeroLow == 0)
break;
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
Known.Zero.setLowBits(KnownZeroLow);
break;
}
case ISD::UADDO:
case ISD::SADDO:
case ISD::ADDCARRY:
if (Op.getResNo() == 1) {
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
Known.Zero.setBitsFrom(1);
break;
}
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE: {
// The low bits of the output are known to be zero out to the number of low
// clear bits common to both LHS & RHS. For example, 8+(X<<3) is known to
// have the low 3 bits clear.
// Output known-0 bits are also known if the top bits of each input are
// known to be clear. For example, if one input has the top 10 bits clear
// and the other has the top 8 bits clear, we know the top 7 bits of the
// output must be clear.
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
unsigned KnownZeroLow = Known2.countMinTrailingZeros();
computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
// With ADDE and ADDCARRY, a carry bit may be added in, so we can only
// use this information if we know (at least) that the low two bits are
// clear. We then return to the caller that the low bit is unknown but
// that other bits are known zero.
if (KnownZeroLow >= 2)
Known.Zero.setBits(1, KnownZeroLow);
break;
}
Known.Zero.setLowBits(KnownZeroLow);
if (KnownZeroHigh > 1)
Known.Zero.setHighBits(KnownZeroHigh - 1);
break;
}
case ISD::SREM:
if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
const APInt &RA = Rem->getAPIntValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// The low bits of the first operand are unchanged by the srem.
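// e.g. for X srem 8 the low 3 bits of the result equal the low 3 bits of
// X; the upper bits are zero when X is non-negative and all ones when X
// is negative with a nonzero low part.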
Known.Zero = Known2.Zero & LowBits;
Known.One = Known2.One & LowBits;
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits))
Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
Known.One |= ~LowBits;
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
}
break;
case ISD::UREM: {
if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
const APInt &RA = Rem->getAPIntValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// The upper bits are all zero, the lower ones are unchanged.
Known.Zero = Known2.Zero | ~LowBits;
Known.One = Known2.One & LowBits;
break;
}
}
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
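// e.g. in i32, if either operand is known to fit in 16 bits, the
// remainder also fits in 16 bits.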
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
uint32_t Leaders =
std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
Known.resetAll();
Known.Zero.setHighBits(Leaders);
break;
}
case ISD::EXTRACT_ELEMENT: {
computeKnownBits(Op.getOperand(0), Known, Depth+1);
const unsigned Index = Op.getConstantOperandVal(1);
const unsigned BitWidth = Op.getValueSizeInBits();
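// e.g. extracting half 1 of an i64 as an i32 keeps bits [32,63]: shift
// the known-bits masks right by Index * BitWidth, then truncate.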
// Remove low part of known bits mask
Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth);
Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth);
// Remove high part of known bit mask
Known = Known.trunc(BitWidth);
break;
}
case ISD::EXTRACT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue EltNo = Op.getOperand(1);
EVT VecVT = InVec.getValueType();
const unsigned BitWidth = Op.getValueSizeInBits();
const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
const unsigned NumSrcElts = VecVT.getVectorNumElements();
// If BitWidth > EltBitWidth the value is any-extended, so we do not know
// anything about the extended bits.
if (BitWidth > EltBitWidth)
Known = Known.trunc(EltBitWidth);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
// If we know the element index, just demand that vector element.
unsigned Idx = ConstEltNo->getZExtValue();
APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
computeKnownBits(InVec, Known, DemandedElt, Depth + 1);
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
computeKnownBits(InVec, Known, Depth + 1);
}
if (BitWidth > EltBitWidth)
Known = Known.zext(BitWidth);
break;
}
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
// If we know the element index, split the demand between the
// source vector and the inserted element.
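// Start from the fully-known state (all bits known both ways) and
// intersect it with the known bits of each source we actually demand.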
Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then add its common known bits.
if (DemandedElts[EltIdx]) {
computeKnownBits(InVal, Known2, Depth + 1);
Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
// If we demand the source vector then add its common known bits, ensuring
// that we don't demand the inserted element.
APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
if (!!VectorElts) {
computeKnownBits(InVec, Known2, VectorElts, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
computeKnownBits(InVec, Known, Depth + 1);
computeKnownBits(InVal, Known2, Depth + 1);
Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
break;
}
case ISD::BITREVERSE: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
Known.Zero = Known2.Zero.reverseBits();
Known.One = Known2.One.reverseBits();
break;
}
case ISD::BSWAP: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
Known.Zero = Known2.Zero.byteSwap();
Known.One = Known2.One.byteSwap();
break;
}
case ISD::ABS: {
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If the source's MSB is known zero, ABS is a no-op and the known bits
// carry over unchanged.
if (Known2.isNonNegative()) {
Known.Zero = Known2.Zero;
Known.One = Known2.One;
break;
}
// We only know that the absolute value's MSB will be zero if there is
// a known-set bit that isn't the sign bit (otherwise the value could be
// INT_MIN).
Known2.One.clearSignBit();
if (Known2.One.getBoolValue()) {
Known.Zero = APInt::getSignMask(BitWidth);
break;
}
break;
}
case ISD::UMIN: {
computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
// UMIN - the result has at least the maximum of the inputs' known
// leading zero bits.
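// e.g. if either input is known to be less than 256, so is the umin.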
unsigned LeadZero = Known.countMinLeadingZeros();
LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
Known.Zero.setHighBits(LeadZero);
break;
}
case ISD::UMAX: {
computeKnownBits(Op.getOperand(0), Known, DemandedElts,
Depth + 1);
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
// UMAX - the result has at least the maximum of the inputs' known
// leading one bits.
unsigned LeadOne = Known.countMinLeadingOnes();
LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
Known.One.setHighBits(LeadOne);
break;
}
case ISD::SMIN:
case ISD::SMAX: {
computeKnownBits(Op.getOperand(0), Known, DemandedElts,
Depth + 1);
// If we don't know any bits, early out.
if (!Known.One && !Known.Zero)
break;
computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
Known.Zero &= Known2.Zero;
Known.One &= Known2.One;
break;
}
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
if (unsigned Align = InferPtrAlignment(Op)) {
// The low bits are known zero if the pointer is aligned.
Known.Zero.setLowBits(Log2_32(Align));
break;
}
break;
default:
if (Opcode < ISD::BUILTIN_OP_END)
break;
LLVM_FALLTHROUGH;
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_W_CHAIN:
case ISD::INTRINSIC_VOID:
// Allow the target to implement this method for its nodes.
TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
break;
}
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
SDValue N1) const {
// X + 0 never overflows.
if (isNullConstant(N1))
return OFK_Never;
KnownBits N1Known;
computeKnownBits(N1, N1Known);
if (N1Known.Zero.getBoolValue()) {
KnownBits N0Known;
computeKnownBits(N0, N0Known);
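// ~Known.Zero is an upper bound on each operand; if even those maxima
// add without wrapping, the addition can never overflow.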
bool overflow;
(void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow);
if (!overflow)
return OFK_Never;
}
// mulhi + 1 never overflows.
if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
(~N1Known.Zero & 0x01) == ~N1Known.Zero)
return OFK_Never;
if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) {
KnownBits N0Known;
computeKnownBits(N0, N0Known);
if ((~N0Known.Zero & 0x01) == ~N0Known.Zero)
return OFK_Never;
}
return OFK_Sometime;
}
bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
EVT OpVT = Val.getValueType();
unsigned BitWidth = OpVT.getScalarSizeInBits();
// Is the constant a known power of 2?
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val))
return Const->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
// A left-shift of a constant one will have exactly one bit set because
// shifting the bit off the end is undefined.
if (Val.getOpcode() == ISD::SHL) {
auto *C = isConstOrConstSplat(Val.getOperand(0));
if (C && C->getAPIntValue() == 1)
return true;
}
// Similarly, a logical right-shift of a constant sign-bit will have exactly
// one bit set.
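// e.g. in i32, (0x80000000 >> c) has exactly one bit set for any
// in-range shift amount c.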
if (Val.getOpcode() == ISD::SRL) {
auto *C = isConstOrConstSplat(Val.getOperand(0));
if (C && C->getAPIntValue().isSignMask())
return true;
}
// Are all operands of a build vector constant powers of two?
if (Val.getOpcode() == ISD::BUILD_VECTOR)
if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E))
return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
return false;
}))
return true;
// More could be done here, though the above checks are enough
// to handle some common cases.
// Fall back to computeKnownBits to catch other known cases.
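// A value is a power of two iff exactly one bit is set, which the bounds
// on the known population count can prove.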
KnownBits Known;
computeKnownBits(Val, Known);
return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
return ComputeNumSignBits(Op, DemandedElts, Depth);
}
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
unsigned Depth) const {
EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
if (Depth == 6)
return 1; // Limit search depth.
if (!DemandedElts)
return 1; // No demanded elts, better to assume we don't know anything.
switch (Op.getOpcode()) {
default: break;
case ISD::AssertSext:
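// AssertSext from an iN value guarantees the top VTBits - N + 1 bits all
// equal the sign bit of the narrow value.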
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
return VTBits-Tmp+1;
case ISD::AssertZext:
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
return VTBits-Tmp;
case ISD::Constant: {
const APInt &Val = cast<ConstantSDNode>(Op)->getAPIntValue();
return Val.getNumSignBits();
}
case ISD::BUILD_VECTOR:
Tmp = VTBits;
for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
if (!DemandedElts[i])
continue;
SDValue SrcOp = Op.getOperand(i);
Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
if (SrcOp.getValueSizeInBits() != VTBits) {
assert(SrcOp.getValueSizeInBits() > VTBits &&
"Expected BUILD_VECTOR implicit truncation");
unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
}
Tmp = std::min(Tmp, Tmp2);
}
return Tmp;
case ISD::VECTOR_SHUFFLE: {
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
for (unsigned i = 0; i != NumElts; ++i) {
int M = SVN->getMaskElt(i);
if (!DemandedElts[i])
continue;
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
if (M < 0)
return 1;
if ((unsigned)M < NumElts)
DemandedLHS.setBit((unsigned)M % NumElts);
else
DemandedRHS.setBit((unsigned)M % NumElts);
}
Tmp = std::numeric_limits<unsigned>::max();
if (!!DemandedLHS)
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
if (!!DemandedRHS) {
Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
// If we don't know anything, early out and try computeKnownBits fall-back.
if (Tmp == 1)
break;
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
case ISD::SIGN_EXTEND:
case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
case ISD::SIGN_EXTEND_INREG:
// Max of the input and what this extends.
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
Tmp = VTBits-Tmp+1;
Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1);
return std::max(Tmp, Tmp2);
case ISD::SRA:
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
// SRA X, C -> adds C sign bits.
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {
APInt ShiftVal = C->getAPIntValue();
ShiftVal += Tmp;
Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
return Tmp;
case ISD::SHL:
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {
// shl destroys sign bits.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (C->getAPIntValue().uge(VTBits) || // Bad shift.
C->getAPIntValue().uge(Tmp)) break; // Shifted all sign bits out.
return Tmp - C->getZExtValue();
}
break;
case ISD::AND:
case ISD::OR:
case ISD::XOR: // NOT is handled here.
// Logical binary ops preserve the number of sign bits at the worst.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp != 1) {
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
FirstAnswer = std::min(Tmp, Tmp2);
// We computed what we know about the sign bits as our first
// answer. Now proceed to the generic code that uses
// computeKnownBits, and pick whichever answer is better.
}
break;
case ISD::SELECT:
Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp == 1) return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
return std::min(Tmp, Tmp2);
case ISD::SELECT_CC:
Tmp = ComputeNumSignBits(Op.getOperand(2), Depth+1);
if (Tmp == 1) return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(3), Depth+1);
return std::min(Tmp, Tmp2);
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
if (Tmp == 1)
return 1; // Early out.
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
return std::min(Tmp, Tmp2);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
if (Op.getResNo() != 1)
break;
// The boolean result conforms to getBooleanContents. Fall through.
// If setcc returns 0/-1, all bits are sign bits.
// We know that we have an integer-based boolean since these operations
// are only available for integer types.
if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
case ISD::SETCC:
// If setcc returns 0/-1, all bits are sign bits.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
case ISD::ROTL:
case ISD::ROTR:
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned RotAmt = C->getZExtValue() & (VTBits-1);
// Handle rotate right by N like a rotate left by VTBits-N.
if (Op.getOpcode() == ISD::ROTR)
RotAmt = (VTBits-RotAmt) & (VTBits-1);
// If we aren't rotating out all of the known-in sign bits, return the
// number that are left. This handles rotl(sext(x), 1) for example.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp > RotAmt+1) return Tmp-RotAmt;
}
break;
case ISD::ADD:
case ISD::ADDC:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp == 1) return 1; // Early out.
// Special case decrementing a value (ADD X, -1):
if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
if (CRHS->isAllOnesValue()) {
KnownBits Known;
computeKnownBits(Op.getOperand(0), Known, Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If we are subtracting one from a non-negative number, there is no
// carry out of the result.
if (Known.isNonNegative())
return Tmp;
}
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp2 == 1) return 1;
return std::min(Tmp, Tmp2)-1;
case ISD::SUB:
Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp2 == 1) return 1;
// Handle NEG.
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
if (CLHS->isNullValue()) {
KnownBits Known;
computeKnownBits(Op.getOperand(1), Known, Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If the input is known to be positive (the sign bit is known clear),
// the output of the NEG has the same number of sign bits as the input.
if (Known.isNonNegative())
return Tmp2;
// Otherwise, we treat this like a SUB.
}
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp == 1) return 1; // Early out.
return std::min(Tmp, Tmp2)-1;
case ISD::TRUNCATE: {
// Check if the sign bits of source go down as far as the truncated value.
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
if (NumSrcSignBits > (NumSrcBits - VTBits))
return NumSrcSignBits - (NumSrcBits - VTBits);
break;
}
case ISD::EXTRACT_ELEMENT: {
const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
const int BitWidth = Op.getValueSizeInBits();
const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;
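// For example, an i64 with 40 sign bits split into two i32 halves: the
// high half (operand 1 == 1) keeps min(40, 32) = 32 sign bits, while the
// low half (operand 1 == 0) keeps max(40 - 32, 0) = 8.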
// Convert to an index counted from the big end: operand 1 indexes
// elements from the little end, but the sign bits live at the big end.
const int rIndex = Items - 1 - Op.getConstantOperandVal(1);
// If the run of sign bits reaches into our element, the subtraction gives
// the correct count; otherwise it is negative or exceeds the bit width,
// so clamp the result to [0, BitWidth].
return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
unsigned NumElts = InVec.getValueType().getVectorNumElements();
ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
// If we know the element index, split the demand between the
// source vector and the inserted element.
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then get its sign bits.
Tmp = std::numeric_limits<unsigned>::max();
if (DemandedElts[EltIdx]) {
// TODO - handle implicit truncation of inserted elements.
if (InVal.getScalarValueSizeInBits() != VTBits)
break;
Tmp = ComputeNumSignBits(InVal, Depth + 1);
}
// If we demand the source vector then get its sign bits, and determine
// the minimum.
APInt VectorElts = DemandedElts;
VectorElts.clearBit(EltIdx);
if (!!VectorElts) {
Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
Tmp = ComputeNumSignBits(InVec, Depth + 1);
Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
case ISD::EXTRACT_VECTOR_ELT: {
SDValue InVec = Op.getOperand(0);
SDValue EltNo = Op.getOperand(1);
EVT VecVT = InVec.getValueType();
const unsigned BitWidth = Op.getValueSizeInBits();
const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
const unsigned NumSrcElts = VecVT.getVectorNumElements();
// If BitWidth > EltBitWidth the value is any-extended, and we do not know
// anything about sign bits. But if the sizes match we can derive knowledge
// about sign bits from the vector operand.
if (BitWidth != EltBitWidth)
break;
// If we know the element index, just demand that vector element, else for
// an unknown element index, ignore DemandedElts and demand them all.
APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts =
APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
case ISD::EXTRACT_SUBVECTOR: {
// If we know the subvector index, just demand the elements of that
// subvector, otherwise demand them all.
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
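// e.g. when extracting a 4-element subvector at index 4 from an
// 8-element source, demanded element i of the result maps to source
// element i + 4.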
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
}
return ComputeNumSignBits(Src, Depth + 1);
}
case ISD::CONCAT_VECTORS:
// Determine the minimum number of sign bits across all demanded
// elts of the input vectors. Early out if the result is already 1.
Tmp = std::numeric_limits<unsigned>::max();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
DemandedSub = DemandedSub.trunc(NumSubVectorElts);
if (!DemandedSub)
continue;
Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
// If we are looking at the loaded value of the SDNode.
if (Op.getResNo() == 0) {
// Handle LOADX separately here. EXTLOAD case will fallthrough.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
unsigned ExtType = LD->getExtensionType();
switch (ExtType) {
default: break;
case ISD::SEXTLOAD: // e.g. i16 -> i32 = '17' bits known
Tmp = LD->getMemoryVT().getScalarSizeInBits();
return VTBits-Tmp+1;
case ISD::ZEXTLOAD: // e.g. i16 -> i32 = '16' bits known
Tmp = LD->getMemoryVT().getScalarSizeInBits();
return VTBits-Tmp;
}
}
}
// Allow the target to implement this method for its nodes.
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_VOID) {
unsigned NumBits =
TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
if (NumBits > 1)
FirstAnswer = std::max(FirstAnswer, NumBits);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
KnownBits Known;
computeKnownBits(Op, Known, DemandedElts, Depth);
APInt Mask;
if (Known.isNonNegative()) { // sign bit is 0
Mask = Known.Zero;
} else if (Known.isNegative()) { // sign bit is 1;
Mask = Known.One;
} else {
// Nothing known.
return FirstAnswer;
}
// Okay, we know that the sign bit in Mask is set. Use CLZ to determine
// the number of identical bits in the top of the input value.
Mask = ~Mask;
Mask <<= Mask.getBitWidth()-VTBits;
// Return # leading zeros. We use 'min' here in case Val was zero before
// shifting. We don't want to return '64' as for an i32 "0".
return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
}
bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
!isa<ConstantSDNode>(Op.getOperand(1)))
return false;
if (Op.getOpcode() == ISD::OR &&
!MaskedValueIsZero(Op.getOperand(0),
cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue()))
return false;
return true;
}
bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
// If we're told that NaNs won't happen, assume they won't.
if (getTarget().Options.NoNaNsFPMath)
return true;
if (Op->getFlags().hasNoNaNs())
return true;
// If the value is a constant, we can obviously see if it is a NaN or not.
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
return !C->getValueAPF().isNaN();
// TODO: Recognize more cases here.
return false;
}
bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
// If the value is a constant, we can obviously see if it is a zero or not.
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
return !C->isZero();
// TODO: Recognize more cases here.
switch (Op.getOpcode()) {
default: break;
case ISD::OR:
if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
return !C->isNullValue();
break;
}
return false;
}
bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
// Check the obvious case.
if (A == B) return true;
// Treat negative zero and positive zero as equal.
if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
if (CA->isZero() && CB->isZero()) return true;
// Otherwise they may not be equal.
return false;
}
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
assert(A.getValueType() == B.getValueType() &&
"Values must have the same type");
KnownBits AKnown, BKnown;
computeKnownBits(A, AKnown);
computeKnownBits(B, BKnown);
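// It suffices that every bit position is known to be zero in at least
// one of the two values.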
return (AKnown.Zero | BKnown.Zero).isAllOnesValue();
}
static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
SelectionDAG &DAG) {
assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
assert(llvm::all_of(Ops,
[Ops](SDValue Op) {
return Ops[0].getValueType() == Op.getValueType();
}) &&
"Concatenation of vectors with inconsistent value types!");
assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
VT.getVectorNumElements() &&
"Incorrect element count in vector concatenation!");
if (Ops.size() == 1)
return Ops[0];
// Concat of UNDEFs is UNDEF.
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
// A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
// simplified to one big BUILD_VECTOR.
// FIXME: Add support for SCALAR_TO_VECTOR as well.
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 16> Elts;
for (SDValue Op : Ops) {
EVT OpVT = Op.getValueType();
if (Op.isUndef())
Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
else if (Op.getOpcode() == ISD::BUILD_VECTOR)
Elts.append(Op->op_begin(), Op->op_end());
else
return SDValue();
}
// BUILD_VECTOR requires all inputs to be of the same type, so find the
// maximum type and extend them all.
for (SDValue Op : Elts)
SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
if (SVT.bitsGT(VT.getScalarType()))
for (SDValue &Op : Elts)
Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
? DAG.getZExtOrTrunc(Op, DL, SVT)
: DAG.getSExtOrTrunc(Op, DL, SVT);
return DAG.getBuildVector(VT, DL, Elts);
}
/// Gets or creates the specified node.
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, getVTList(VT), None);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
getVTList(VT));
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue Operand, const SDNodeFlags Flags) {
// Constant fold unary operations with an integer constant operand. Even
// opaque constants will be folded, because the folding of unary operations
// doesn't create new constants with different values. Nevertheless, the
// opaque flag is preserved during folding to prevent future folding with
// other constants.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
const APInt &Val = C->getAPIntValue();
switch (Opcode) {
default: break;
case ISD::SIGN_EXTEND:
return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
C->isTargetOpcode(), C->isOpaque());
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::TRUNCATE:
return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
C->isTargetOpcode(), C->isOpaque());
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: {
APFloat apf(EVTToAPFloatSemantics(VT),
APInt::getNullValue(VT.getSizeInBits()));
(void)apf.convertFromAPInt(Val,
Opcode==ISD::SINT_TO_FP,
APFloat::rmNearestTiesToEven);
return getConstantFP(apf, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
break;
case ISD::ABS:
return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::BITREVERSE:
return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::BSWAP:
return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTPOP:
return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
case ISD::FP16_TO_FP: {
bool Ignored;
APFloat FPV(APFloat::IEEEhalf(),
(Val.getBitWidth() == 16) ? Val : Val.trunc(16));
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)FPV.convert(EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven, &Ignored);
return getConstantFP(FPV, DL, VT);
}
}
}
// Constant fold unary operations with a floating point constant operand.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
APFloat V = C->getValueAPF(); // make copy
switch (Opcode) {
case ISD::FNEG:
V.changeSign();
return getConstantFP(V, DL, VT);
case ISD::FABS:
V.clearSign();
return getConstantFP(V, DL, VT);
case ISD::FCEIL: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FTRUNC: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FFLOOR: {
APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, DL, VT);
break;
}
case ISD::FP_EXTEND: {
bool ignored;
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)V.convert(EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven, &ignored);
return getConstantFP(V, DL, VT);
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool ignored;
APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
// FIXME need to be more flexible about rounding mode.
APFloat::opStatus s =
V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
return getConstant(IntVal, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
break;
case ISD::FP_TO_FP16: {
bool Ignored;
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)V.convert(APFloat::IEEEhalf(),
APFloat::rmNearestTiesToEven, &Ignored);
return getConstant(V.bitcastToAPInt(), DL, VT);
}
}
}
// Constant fold unary operations with a vector integer or float operand.
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
if (BV->isConstant()) {
switch (Opcode) {
default:
// FIXME: Entirely reasonable to perform folding of other unary
// operations here as the need arises.
break;
case ISD::FNEG:
case ISD::FABS:
case ISD::FCEIL:
case ISD::FTRUNC:
case ISD::FFLOOR:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP:
case ISD::ABS:
case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTPOP: {
SDValue Ops = { Operand };
if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
return Fold;
}
}
}
}
unsigned OpOpcode = Operand.getNode()->getOpcode();
switch (Opcode) {
case ISD::TokenFactor:
case ISD::MERGE_VALUES:
case ISD::CONCAT_VECTORS:
return Operand; // Factor, merge or concat of one node? No need.
case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
case ISD::FP_EXTEND:
assert(VT.isFloatingPoint() &&
Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
if (Operand.getValueType() == VT) return Operand; // noop conversion.
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid fpext node, dst < src!");
if (Operand.isUndef())
return getUNDEF(VT);
break;
case ISD::SIGN_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid SIGN_EXTEND!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
break;
case ISD::ZERO_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid ZERO_EXTEND!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
break;
case ISD::ANY_EXTEND:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid ANY_EXTEND!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid anyext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT)
return OpOp;
}
break;
case ISD::TRUNCATE:
assert(VT.isInteger() && Operand.getValueType().isInteger() &&
"Invalid TRUNCATE!");
if (Operand.getValueType() == VT) return Operand; // noop truncate
assert((!VT.isVector() ||
VT.getVectorNumElements() ==
Operand.getValueType().getVectorNumElements()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
if (Operand.getOperand(0).getValueType().bitsGT(VT))
return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::ABS:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid ABS!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BSWAP:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BSWAP!");
assert((VT.getScalarSizeInBits() % 16 == 0) &&
"BSWAP types must be a multiple of 16 bits!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BITREVERSE:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BITREVERSE!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::BITCAST:
// Basic sanity checking.
assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
"Cannot BITCAST between types of different sizes!");
if (VT == Operand.getValueType()) return Operand; // noop conversion.
if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
case ISD::SCALAR_TO_VECTOR:
assert(VT.isVector() && !Operand.getValueType().isVector() &&
(VT.getVectorElementType() == Operand.getValueType() ||
(VT.getVectorElementType().isInteger() &&
Operand.getValueType().isInteger() &&
VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
"Illegal SCALAR_TO_VECTOR node!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Operand.getOperand(1)) &&
Operand.getConstantOperandVal(1) == 0 &&
Operand.getOperand(0).getValueType() == VT)
return Operand.getOperand(0);
break;
case ISD::FNEG:
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
Operand.getOperand(0), Operand.getNode()->getFlags());
if (OpOpcode == ISD::FNEG) // --X -> X
return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
SDNode *N;
SDVTList VTs = getVTList(VT);
SDValue Ops[] = {Operand};
if (VT != MVT::Glue) { // Don't CSE flag producing nodes
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
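/// Fold a binary operation on two integer constants. The bool in the
/// returned pair is false when the opcode is not handled or the operation
/// (division or remainder by zero) cannot be folded.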
static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
const APInt &C2) {
switch (Opcode) {
case ISD::ADD: return std::make_pair(C1 + C2, true);
case ISD::SUB: return std::make_pair(C1 - C2, true);
case ISD::MUL: return std::make_pair(C1 * C2, true);
case ISD::AND: return std::make_pair(C1 & C2, true);
case ISD::OR: return std::make_pair(C1 | C2, true);
case ISD::XOR: return std::make_pair(C1 ^ C2, true);
case ISD::SHL: return std::make_pair(C1 << C2, true);
case ISD::SRL: return std::make_pair(C1.lshr(C2), true);
case ISD::SRA: return std::make_pair(C1.ashr(C2), true);
case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
case ISD::UDIV:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.udiv(C2), true);
case ISD::UREM:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.urem(C2), true);
case ISD::SDIV:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.sdiv(C2), true);
case ISD::SREM:
if (!C2.getBoolValue())
break;
return std::make_pair(C1.srem(C2), true);
}
return std::make_pair(APInt(1, 0), false);
}
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
EVT VT, const ConstantSDNode *Cst1,
const ConstantSDNode *Cst2) {
if (Cst1->isOpaque() || Cst2->isOpaque())
return SDValue();
std::pair<APInt, bool> Folded = FoldValue(Opcode, Cst1->getAPIntValue(),
Cst2->getAPIntValue());
if (!Folded.second)
return SDValue();
return getConstant(Folded.first, DL, VT);
}
SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
const GlobalAddressSDNode *GA,
const SDNode *N2) {
if (GA->getOpcode() != ISD::GlobalAddress)
return SDValue();
if (!TLI->isOffsetFoldingLegal(GA))
return SDValue();
const ConstantSDNode *Cst2 = dyn_cast<ConstantSDNode>(N2);
if (!Cst2)
return SDValue();
int64_t Offset = Cst2->getSExtValue();
switch (Opcode) {
case ISD::ADD: break;
case ISD::SUB: Offset = -uint64_t(Offset); break;
default: return SDValue();
}
return getGlobalAddress(GA->getGlobal(), SDLoc(Cst2), VT,
GA->getOffset() + uint64_t(Offset));
}
bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
switch (Opcode) {
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: {
// If a divisor is zero/undef or any element of a divisor vector is
// zero/undef, the whole op is undef.
assert(Ops.size() == 2 && "Div/rem should have 2 operands");
SDValue Divisor = Ops[1];
if (Divisor.isUndef() || isNullConstant(Divisor))
return true;
return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
llvm::any_of(Divisor->op_values(),
[](SDValue V) { return V.isUndef() ||
isNullConstant(V); });
// TODO: Handle signed overflow.
}
// TODO: Handle oversized shifts.
default:
return false;
}
}
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
EVT VT, SDNode *Cst1,
SDNode *Cst2) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
return getUNDEF(VT);
// Handle the case of two scalars.
if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
assert((!Folded || !VT.isVector()) &&
"Can't fold vectors ops with scalar operands");
return Folded;
}
}
// fold (add Sym, c) -> Sym+c
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1))
return FoldSymbolOffset(Opcode, VT, GA, Cst2);
if (TLI->isCommutativeBinOp(Opcode))
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
return FoldSymbolOffset(Opcode, VT, GA, Cst1);
// For vectors extract each constant element into Inputs so we can constant
// fold them individually.
BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
if (!BV1 || !BV2)
return SDValue();
assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!");
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 4> Outputs;
for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
SDValue V1 = BV1->getOperand(I);
SDValue V2 = BV2->getOperand(I);
// Avoid BUILD_VECTOR nodes that perform implicit truncation.
// FIXME: This is valid and could be handled by truncation.
if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
return SDValue();
// Fold one vector element.
SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2);
// Scalar folding only succeeded if the result is a constant or UNDEF.
if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
ScalarResult.getOpcode() != ISD::ConstantFP)
return SDValue();
Outputs.push_back(ScalarResult);
}
assert(VT.getVectorNumElements() == Outputs.size() &&
"Vector size mismatch!");
// We may have a vector type but a scalar result. Create a splat.
Outputs.resize(VT.getVectorNumElements(), Outputs.back());
// Build a big vector out of the scalar elements we generated.
return getBuildVector(VT, SDLoc(), Outputs);
}
SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
const SDNodeFlags Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
if (isUndef(Opcode, Ops))
return getUNDEF(VT);
// We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
if (!VT.isVector())
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
auto IsScalarOrSameVectorSize = [&](const SDValue &Op) {
return !Op.getValueType().isVector() ||
Op.getValueType().getVectorNumElements() == NumElts;
};
auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
return (Op.isUndef()) || (Op.getOpcode() == ISD::CONDCODE) ||
(BV && BV->isConstant());
};
// All operands must be vector types with the same number of elements as
// the result type and must be either UNDEF or a build vector of constant
// or UNDEF scalars.
if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) ||
!llvm::all_of(Ops, IsScalarOrSameVectorSize))
return SDValue();
// If we are comparing vectors, then the result needs to be an i1 boolean
// that is then sign-extended back to the legal result type.
EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType());
// Find legal integer scalar type for constant promotion and
// ensure that its scalar size is at least as large as the source.
EVT LegalSVT = VT.getScalarType();
if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
if (LegalSVT.bitsLT(VT.getScalarType()))
return SDValue();
}
// Constant fold each scalar lane separately.
SmallVector<SDValue, 4> ScalarResults;
for (unsigned i = 0; i != NumElts; i++) {
SmallVector<SDValue, 4> ScalarOps;
for (SDValue Op : Ops) {
EVT InSVT = Op.getValueType().getScalarType();
BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op);
if (!InBV) {
// We've checked that this is UNDEF or a constant of some kind.
if (Op.isUndef())
ScalarOps.push_back(getUNDEF(InSVT));
else
ScalarOps.push_back(Op);
continue;
}
SDValue ScalarOp = InBV->getOperand(i);
EVT ScalarVT = ScalarOp.getValueType();
// Build vector (integer) scalar operands may need implicit
// truncation - do this before constant folding.
if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT))
ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp);
ScalarOps.push_back(ScalarOp);
}
// Constant fold the scalar operands.
SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
// Legalize the (integer) scalar constant if necessary.
if (LegalSVT != SVT)
ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
// Scalar folding only succeeded if the result is a constant or UNDEF.
if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
ScalarResult.getOpcode() != ISD::ConstantFP)
return SDValue();
ScalarResults.push_back(ScalarResult);
}
return getBuildVector(VT, DL, ScalarResults);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, const SDNodeFlags Flags) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
// Canonicalize constant to RHS if commutative.
if (TLI->isCommutativeBinOp(Opcode)) {
if (N1C && !N2C) {
std::swap(N1C, N2C);
std::swap(N1, N2);
} else if (N1CFP && !N2CFP) {
std::swap(N1CFP, N2CFP);
std::swap(N1, N2);
}
}
switch (Opcode) {
default: break;
case ISD::TokenFactor:
assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
N2.getValueType() == MVT::Other && "Invalid token factor!");
// Fold trivial token factors.
if (N1.getOpcode() == ISD::EntryToken) return N2;
if (N2.getOpcode() == ISD::EntryToken) return N1;
if (N1 == N2) return N1;
break;
case ISD::CONCAT_VECTORS: {
// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
SDValue Ops[] = {N1, N2};
if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
}
case ISD::AND:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
// worth handling here.
if (N2C && N2C->isNullValue())
return N2;
if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
return N1;
break;
case ISD::OR:
case ISD::XOR:
case ISD::ADD:
case ISD::SUB:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
// (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
// it's worth handling here.
if (N2C && N2C->isNullValue())
return N1;
break;
case ISD::UDIV:
case ISD::UREM:
case ISD::MULHU:
case ISD::MULHS:
case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
if (getTarget().Options.UnsafeFPMath) {
if (Opcode == ISD::FADD) {
// x+0 --> x
if (N2CFP && N2CFP->getValueAPF().isZero())
return N1;
} else if (Opcode == ISD::FSUB) {
// x-0 --> x
if (N2CFP && N2CFP->getValueAPF().isZero())
return N1;
} else if (Opcode == ISD::FMUL) {
// x*0 --> 0
if (N2CFP && N2CFP->isZero())
return N2;
// x*1 --> x
if (N2CFP && N2CFP->isExactlyValue(1.0))
return N1;
}
}
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
break;
case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
assert(N1.getValueType() == VT &&
N1.getValueType().isFloatingPoint() &&
N2.getValueType().isFloatingPoint() &&
"Invalid FCOPYSIGN!");
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::ROTL:
case ISD::ROTR:
assert(VT == N1.getValueType() &&
"Shift operators' return type must be the same as their first arg");
assert(VT.isInteger() && N2.getValueType().isInteger() &&
"Shifts only work on integers");
assert((!VT.isVector() || VT == N2.getValueType()) &&
"Vector shift amounts must have the same type as their first arg");
// Verify that the shift amount VT is big enough to hold valid shift
// amounts. This catches things like trying to shift an i1024 value by an
// i8, which is easy to fall into in generic code that uses
// TLI.getShiftAmount().
assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
"Invalid use of small shift amount with oversized value!");
// Always fold shifts of i1 values so the code generator doesn't need to
// handle them. Since we know the size of the shift has to be less than the
// size of the value, the shift/rotate count is guaranteed to be zero.
if (VT == MVT::i1)
return N1;
if (N2C && N2C->isNullValue())
return N1;
break;
case ISD::FP_ROUND_INREG: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg round!");
assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
"Cannot FP_ROUND_INREG integer types");
assert(EVT.isVector() == VT.isVector() &&
"FP_ROUND_INREG type should be vector iff the operand "
"type is vector!");
assert((!EVT.isVector() ||
EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
"Vector element counts must match in FP_ROUND_INREG");
assert(EVT.bitsLE(VT) && "Not rounding down!");
(void)EVT;
if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
break;
}
case ISD::FP_ROUND:
assert(VT.isFloatingPoint() &&
N1.getValueType().isFloatingPoint() &&
VT.bitsLE(N1.getValueType()) &&
N2C && (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) &&
"Invalid FP_ROUND!");
if (N1.getValueType() == VT) return N1; // noop conversion.
break;
case ISD::AssertSext:
case ISD::AssertZext: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg extend!");
assert(VT.isInteger() && EVT.isInteger() &&
"Cannot *_EXTEND_INREG FP types");
assert(!EVT.isVector() &&
"AssertSExt/AssertZExt type should be the vector element type "
"rather than the vector type!");
assert(EVT.bitsLE(VT) && "Not extending!");
if (VT == EVT) return N1; // noop assertion.
break;
}
case ISD::SIGN_EXTEND_INREG: {
EVT EVT = cast<VTSDNode>(N2)->getVT();
assert(VT == N1.getValueType() && "Not an inreg extend!");
assert(VT.isInteger() && EVT.isInteger() &&
"Cannot *_EXTEND_INREG FP types");
assert(EVT.isVector() == VT.isVector() &&
"SIGN_EXTEND_INREG type should be vector iff the operand "
"type is vector!");
assert((!EVT.isVector() ||
EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
"Vector element counts must match in SIGN_EXTEND_INREG");
assert(EVT.bitsLE(VT) && "Not extending!");
if (EVT == VT) return N1; // Not actually extending
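// Sign-extend the low EVT bits of Val: shift them to the top of the
// word, then arithmetic-shift back down so the sign bit is replicated.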
auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
unsigned FromBits = EVT.getScalarSizeInBits();
Val <<= Val.getBitWidth() - FromBits;
Val.ashrInPlace(Val.getBitWidth() - FromBits);
return getConstant(Val, DL, ConstantVT);
};
if (N1C) {
const APInt &Val = N1C->getAPIntValue();
return SignExtendInReg(Val, VT);
}
if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
SmallVector<SDValue, 8> Ops;
llvm::EVT OpVT = N1.getOperand(0).getValueType();
for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N1.getOperand(i);
if (Op.isUndef()) {
Ops.push_back(getUNDEF(OpVT));
continue;
}
ConstantSDNode *C = cast<ConstantSDNode>(Op);
APInt Val = C->getAPIntValue();
Ops.push_back(SignExtendInReg(Val, OpVT));
}
return getBuildVector(VT, DL, Ops);
}
break;
}
case ISD::EXTRACT_VECTOR_ELT:
// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
if (N2C && N2C->getZExtValue() >= N1.getValueType().getVectorNumElements())
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
// expanding copies of large vectors from registers.
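// e.g. with a concat of two v4i32 operands, extracting element 5 reads
// element 5 % 4 == 1 of operand 5 / 4 == 1.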
if (N2C &&
N1.getOpcode() == ISD::CONCAT_VECTORS &&
N1.getNumOperands() > 0) {
unsigned Factor =
N1.getOperand(0).getValueType().getVectorNumElements();
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
N1.getOperand(N2C->getZExtValue() / Factor),
getConstant(N2C->getZExtValue() % Factor, DL,
N2.getValueType()));
}
// EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
// expanding large vector constants.
if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt = N1.getOperand(N2C->getZExtValue());
if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
// are promoted and implicitly truncated, and the result implicitly
// extended. Make that explicit here.
Elt = getAnyExtOrTrunc(Elt, DL, VT);
return Elt;
}
// EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
// operations are lowered to scalars.
if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
// If the indices are the same, return the inserted element else
// if the indices are known different, extract the element from
// the original vector.
SDValue N1Op2 = N1.getOperand(2);
ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
if (N1Op2C && N2C) {
if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
if (VT == N1.getOperand(1).getValueType())
return N1.getOperand(1);
else
return getSExtOrTrunc(N1.getOperand(1), DL, VT);
}
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
}
}
break;
case ISD::EXTRACT_ELEMENT:
assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
assert(!N1.getValueType().isVector() && !VT.isVector() &&
(N1.getValueType().isInteger() == VT.isInteger()) &&
N1.getValueType() != VT &&
"Wrong types for EXTRACT_ELEMENT!");
// EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
// 64-bit integers into 32-bit parts. Instead of building the extract of
// the BUILD_PAIR, only to have legalize rip it apart, just do it now.
if (N1.getOpcode() == ISD::BUILD_PAIR)
return N1.getOperand(N2C->getZExtValue());
// EXTRACT_ELEMENT of a constant int is also very common.
if (N1C) {
unsigned ElementSize = VT.getSizeInBits();
unsigned Shift = ElementSize * N2C->getZExtValue();
APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
}
break;
case ISD::EXTRACT_SUBVECTOR:
if (VT.isSimple() && N1.getValueType().isSimple()) {
assert(VT.isVector() && N1.getValueType().isVector() &&
"Extract subvector VTs must be a vectors!");
assert(VT.getVectorElementType() ==
N1.getValueType().getVectorElementType() &&
"Extract subvector VTs must have the same element type!");
assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
"Extract subvector must be from larger vector to smaller vector!");
if (N2C) {
assert((VT.getVectorNumElements() + N2C->getZExtValue()
<= N1.getValueType().getVectorNumElements())
&& "Extract subvector overflow!");
}
// Trivial extraction.
if (VT.getSimpleVT() == N1.getSimpleValueType())
return N1;
// EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
if (N1.isUndef())
return getUNDEF(VT);
// EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
// the concat have the same type as the extract.
if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
N1.getNumOperands() > 0 &&
VT == N1.getOperand(0).getValueType()) {
unsigned Factor = VT.getVectorNumElements();
return N1.getOperand(N2C->getZExtValue() / Factor);
}
// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
// during shuffle legalization.
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
VT == N1.getOperand(1).getValueType())
return N1.getOperand(1);
}
break;
}
// Perform trivial constant folding.
if (SDValue SV =
FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
return SV;
// Constant fold FP operations.
bool HasFPExceptions = TLI->hasFloatingPointExceptions();
if (N1CFP) {
if (N2CFP) {
APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
APFloat::opStatus s;
switch (Opcode) {
case ISD::FADD:
s = V1.add(V2, APFloat::rmNearestTiesToEven);
if (!HasFPExceptions || s != APFloat::opInvalidOp)
return getConstantFP(V1, DL, VT);
break;
case ISD::FSUB:
s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
if (!HasFPExceptions || s!=APFloat::opInvalidOp)
return getConstantFP(V1, DL, VT);
break;
case ISD::FMUL:
s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
if (!HasFPExceptions || s!=APFloat::opInvalidOp)
return getConstantFP(V1, DL, VT);
break;
case ISD::FDIV:
s = V1.divide(V2, APFloat::rmNearestTiesToEven);
if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
s!=APFloat::opDivByZero)) {
return getConstantFP(V1, DL, VT);
}
break;
case ISD::FREM :
s = V1.mod(V2);
if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
s!=APFloat::opDivByZero)) {
return getConstantFP(V1, DL, VT);
}
break;
case ISD::FCOPYSIGN:
V1.copySign(V2);
return getConstantFP(V1, DL, VT);
default: break;
}
}
if (Opcode == ISD::FP_ROUND) {
APFloat V = N1CFP->getValueAPF(); // make copy
bool ignored;
// This can return overflow, underflow, or inexact; we don't care.
// FIXME need to be more flexible about rounding mode.
(void)V.convert(EVTToAPFloatSemantics(VT),
APFloat::rmNearestTiesToEven, &ignored);
return getConstantFP(V, DL, VT);
}
}
// Canonicalize an UNDEF to the RHS, even over a constant.
if (N1.isUndef()) {
if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
case ISD::FP_ROUND_INREG:
case ISD::SIGN_EXTEND_INREG:
case ISD::SUB:
case ISD::FSUB:
case ISD::FDIV:
case ISD::FREM:
case ISD::SRA:
return N1; // fold op(undef, arg2) -> undef
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
case ISD::SRL:
case ISD::SHL:
if (!VT.isVector())
return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
// For vectors, we can't easily build an all-zero vector; just return
// the other (non-undef) operand.
return N2;
}
}
}
// Fold a bunch of operators when the RHS is undef.
if (N2.isUndef()) {
switch (Opcode) {
case ISD::XOR:
if (N1.isUndef())
// Handle undef ^ undef -> 0 special case. This is a common
// idiom (misuse).
return getConstant(0, DL, VT);
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUB:
case ISD::UDIV:
case ISD::SDIV:
case ISD::UREM:
case ISD::SREM:
return N2; // fold op(arg1, undef) -> undef
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
if (getTarget().Options.UnsafeFPMath)
return N2;
break;
case ISD::MUL:
case ISD::AND:
case ISD::SRL:
case ISD::SHL:
if (!VT.isVector())
return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
// For vectors, we can't easily build an all zero vector, just return
// the LHS.
return N1;
case ISD::OR:
if (!VT.isVector())
return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
// For vectors, we can't easily build an all one vector, just return
// the LHS.
return N1;
case ISD::SRA:
return N1;
}
}
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
SDValue Ops[] = {N1, N2};
if (VT != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, SDValue N3) {
// Perform various simplifications.
switch (Opcode) {
case ISD::FMA: {
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
if (N1CFP && N2CFP && N3CFP) {
APFloat V1 = N1CFP->getValueAPF();
const APFloat &V2 = N2CFP->getValueAPF();
const APFloat &V3 = N3CFP->getValueAPF();
APFloat::opStatus s =
V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
return getConstantFP(V1, DL, VT);
}
break;
}
case ISD::CONCAT_VECTORS: {
// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
SDValue Ops[] = {N1, N2, N3};
if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
}
case ISD::SETCC: {
// Use FoldSetCC to simplify SETCC's.
if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
return V;
// Vector constant folding.
SDValue Ops[] = {N1, N2, N3};
if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
return V;
break;
}
case ISD::SELECT:
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
if (N1C->getZExtValue())
return N2; // select true, X, Y -> X
return N3; // select false, X, Y -> Y
}
if (N2 == N3) return N2; // select C, X, X -> X
break;
case ISD::VECTOR_SHUFFLE:
llvm_unreachable("should use getVectorShuffle constructor!");
case ISD::INSERT_VECTOR_ELT: {
ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
return getUNDEF(VT);
break;
}
case ISD::INSERT_SUBVECTOR: {
SDValue Index = N3;
if (VT.isSimple() && N1.getValueType().isSimple()
&& N2.getValueType().isSimple()) {
assert(VT.isVector() && N1.getValueType().isVector() &&
N2.getValueType().isVector() &&
"Insert subvector VTs must be a vectors");
assert(VT == N1.getValueType() &&
"Dest and insert subvector source types must match!");
assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
"Insert subvector must be from smaller vector to larger vector!");
if (isa<ConstantSDNode>(Index)) {
assert((N2.getValueType().getVectorNumElements() +
cast<ConstantSDNode>(Index)->getZExtValue()
<= VT.getVectorNumElements())
&& "Insert subvector overflow!");
}
// Trivial insertion.
if (VT.getSimpleVT() == N2.getSimpleValueType())
return N2;
}
break;
}
case ISD::BITCAST:
// Fold bit_convert nodes from a type to themselves.
if (N1.getValueType() == VT)
return N1;
break;
}
// Memoize node if it doesn't produce a flag.
SDNode *N;
SDVTList VTs = getVTList(VT);
SDValue Ops[] = {N1, N2, N3};
if (VT != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
SDValue Ops[] = { N1, N2, N3, N4 };
return getNode(Opcode, DL, VT, Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, SDValue N3, SDValue N4,
SDValue N5) {
SDValue Ops[] = { N1, N2, N3, N4, N5 };
return getNode(Opcode, DL, VT, Ops);
}
/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
/// the incoming stack arguments to be loaded from the stack.
SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
SmallVector<SDValue, 8> ArgChains;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, it helps the legalizer find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);
// Add a chain value for each stack argument.
for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0)
ArgChains.push_back(SDValue(L, 1));
// Build a tokenfactor for all the chains.
return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
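// Illustrative usage sketch, not part of the original source: how a target's
// LowerCall hook (a hypothetical caller; DAG and Chain stand for its usual
// lowering state) might use the helper above when emitting a tail call.
static LLVM_ATTRIBUTE_UNUSED SDValue
chainIncomingStackArgLoadsSketch(SelectionDAG &DAG, SDValue Chain) {
  // Every load from a fixed (negative-index) frame object is glued into the
  // returned TokenFactor, so stores into the outgoing argument area cannot be
  // scheduled before the incoming arguments have been read.
  return DAG.getStackArgumentTokenFactor(Chain);
}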
/// getMemsetValue - Vectorized representation of the memset value
/// operand.
static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
const SDLoc &dl) {
assert(!Value.isUndef());
unsigned NumBits = VT.getScalarSizeInBits();
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
assert(C->getAPIntValue().getBitWidth() == 8);
APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
if (VT.isInteger())
return DAG.getConstant(Val, dl, VT);
return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
VT);
}
assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");
EVT IntVT = VT.getScalarType();
if (!IntVT.isInteger())
IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
if (NumBits > 8) {
// Use a multiplication with 0x010101... to extend the input to the
// required length.
APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
Value = DAG.getNode(ISD::MUL, dl, IntVT, Value,
DAG.getConstant(Magic, dl, IntVT));
}
if (VT != Value.getValueType() && !VT.isInteger())
Value = DAG.getBitcast(VT.getScalarType(), Value);
if (VT != Value.getValueType())
Value = DAG.getSplatBuildVector(VT, dl, Value);
return Value;
}
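// Illustrative sketch, not part of the original source: the value the code
// above computes for a constant fill byte. The helper name is hypothetical and
// only restates the splat arithmetic, e.g. byte 0xAB splatted to 32 bits is
// 0xABABABAB, which is also what the MUL by 0x01010101 produces for a
// non-constant i8 fill value.
static LLVM_ATTRIBUTE_UNUSED APInt splatMemsetByteSketch(uint8_t Byte,
                                                         unsigned DstBits) {
  // APInt::getSplat repeats the 8-bit pattern until DstBits bits are filled.
  return APInt::getSplat(DstBits, APInt(8, Byte));
}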
/// getMemsetStringVal - Similar to getMemsetValue, except this is only used
/// when a memcpy is turned into a memset because the source is a constant
/// string pointer.
static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
const TargetLowering &TLI,
const ConstantDataArraySlice &Slice) {
// Handle vector with all elements zero.
if (Slice.Array == nullptr) {
if (VT.isInteger())
return DAG.getConstant(0, dl, VT);
else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
return DAG.getConstantFP(0.0, dl, VT);
else if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();
MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getConstant(0, dl,
EVT::getVectorVT(*DAG.getContext(),
EltVT, NumElts)));
} else
llvm_unreachable("Expected type!");
}
assert(!VT.isVector() && "Can't handle vector type here!");
unsigned NumVTBits = VT.getSizeInBits();
unsigned NumVTBytes = NumVTBits / 8;
unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));
APInt Val(NumVTBits, 0);
if (DAG.getDataLayout().isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
} else {
for (unsigned i = 0; i != NumBytes; ++i)
Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
}
// If the "cost" of materializing the integer immediate is less than the cost
// of a load, then it is cost effective to turn the load into the immediate.
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
return DAG.getConstant(Val, dl, VT);
return SDValue(nullptr, 0);
}
SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
const SDLoc &DL) {
EVT VT = Base.getValueType();
return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT));
}
/// Returns true if memcpy source is constant data.
static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
uint64_t SrcDelta = 0;
GlobalAddressSDNode *G = nullptr;
if (Src.getOpcode() == ISD::GlobalAddress)
G = cast<GlobalAddressSDNode>(Src);
else if (Src.getOpcode() == ISD::ADD &&
Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
Src.getOperand(1).getOpcode() == ISD::Constant) {
G = cast<GlobalAddressSDNode>(Src.getOperand(0));
SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
}
if (!G)
return false;
return getConstantDataArrayInfo(G->getGlobal(), Slice, 8,
SrcDelta + G->getOffset());
}
/// Determines the optimal series of memory ops to replace the memset / memcpy.
/// Returns true if the number of memory ops is below the threshold (Limit),
/// and returns the sequence of memory op types to use for the memset / memcpy
/// in MemOps, by reference.
static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
unsigned Limit, uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
bool AllowOverlap,
unsigned DstAS, unsigned SrcAS,
SelectionDAG &DAG,
const TargetLowering &TLI) {
assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
"Expecting memcpy / memset source to meet alignment requirement!");
// If 'SrcAlign' is zero, that means the memory operation does not need to
// load the value, i.e. memset or memcpy from constant string. Otherwise,
// it's the inferred alignment of the source. 'DstAlign', on the other hand,
// is the specified alignment of the memory operation. If it is zero, that
// means it's possible to change the alignment of the destination.
// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
// not need to be loaded.
EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
IsMemset, ZeroMemset, MemcpyStrSrc,
DAG.getMachineFunction());
if (VT == MVT::Other) {
// Use the largest integer type whose alignment constraints are satisfied.
// We only need to check DstAlign here, as SrcAlign is always greater than
// or equal to DstAlign (or zero).
VT = MVT::i64;
while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
!TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
assert(VT.isInteger());
// Find the largest legal integer type.
MVT LVT = MVT::i64;
while (!TLI.isTypeLegal(LVT))
LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
assert(LVT.isInteger());
// If the type we've chosen is larger than the largest legal integer type
// then use that instead.
if (VT.bitsGT(LVT))
VT = LVT;
}
unsigned NumMemOps = 0;
while (Size != 0) {
unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {
// For now, only use non-vector loads / stores for the left-over pieces.
EVT NewVT = VT;
unsigned NewVTSize;
bool Found = false;
if (VT.isVector() || VT.isFloatingPoint()) {
NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) &&
TLI.isSafeMemOpType(NewVT.getSimpleVT()))
Found = true;
else if (NewVT == MVT::i64 &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
TLI.isSafeMemOpType(MVT::f64)) {
// i64 is usually not legal on 32-bit targets, but f64 may be.
NewVT = MVT::f64;
Found = true;
}
}
if (!Found) {
do {
NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
if (NewVT == MVT::i8)
break;
} while (!TLI.isSafeMemOpType(NewVT.getSimpleVT()));
}
NewVTSize = NewVT.getSizeInBits() / 8;
// If the new VT cannot cover all of the remaining bits, then consider
// issuing an unaligned and overlapping load / store (or a pair of them).
// FIXME: Only do this for 64-bit or wider accesses, since we don't have a
// proper cost model for unaligned loads / stores.
bool Fast;
if (NumMemOps && AllowOverlap &&
VTSize >= 8 && NewVTSize < Size &&
TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast)
VTSize = Size;
else {
VT = NewVT;
VTSize = NewVTSize;
}
}
if (++NumMemOps > Limit)
return false;
MemOps.push_back(VT);
Size -= VTSize;
}
return true;
}
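// Illustrative sketch, not part of the original source: a typical query for a
// 15-byte, memset-like operation. The helper name and the literal values are
// hypothetical; the call mirrors the uses in getMemcpyLoadsAndStores and
// getMemsetStores below.
static LLVM_ATTRIBUTE_UNUSED bool
memOpLoweringQuerySketch(SelectionDAG &DAG, const TargetLowering &TLI,
                         std::vector<EVT> &MemOps) {
  // Ask for at most 8 stores covering 15 bytes into an alignment-changeable
  // (DstAlign == 0) destination in address space 0. A 64-bit target would
  // typically answer with something like { i64, i32, i16, i8 }, or with two
  // overlapping i64 stores when misaligned accesses are cheap.
  return FindOptimalMemOpLowering(MemOps, /*Limit=*/8, /*Size=*/15,
                                  /*DstAlign=*/0, /*SrcAlign=*/0,
                                  /*IsMemset=*/true, /*ZeroMemset=*/false,
                                  /*MemcpyStrSrc=*/false, /*AllowOverlap=*/true,
                                  /*DstAS=*/0, /*SrcAS=*/~0u, DAG, TLI);
}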
static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
// On Darwin, -Os means optimize for size without hurting performance, so
// only really optimize for size when -Oz (MinSize) is used.
if (MF.getTarget().getTargetTriple().isOSDarwin())
return MF.getFunction()->optForMinSize();
return MF.getFunction()->optForSize();
}
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
// Turn a memcpy of undef to nop.
if (Src.isUndef())
return Chain;
// Expand memcpy to a series of load and store ops if the size operand falls
// below a certain threshold.
// TODO: In the AlwaysInline case, if the size is big then generate a loop
// rather than maybe a humongous number of loads and stores.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
LLVMContext &C = *DAG.getContext();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
if (Align > SrcAlign)
SrcAlign = Align;
ConstantDataArraySlice Slice;
bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
(isZeroConstant ? 0 : SrcAlign),
false, false, CopyFromConstant, true,
DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(),
DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(C);
unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
while (NewAlign > Align &&
DL.exceedsNaturalStackAlignment(NewAlign))
NewAlign /= 2;
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
Align = NewAlign;
}
}
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
SmallVector<SDValue, 8> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
for (unsigned i = 0; i != NumMemOps; ++i) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value, Store;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
SrcOff -= VTSize - Size;
DstOff -= VTSize - Size;
}
if (CopyFromConstant &&
(isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
// instruction; it would require a load from a constant pool first.
// We only handle zero vectors here.
// FIXME: Handle other cases where a store of a vector immediate can be
// done in a single instruction.
ConstantDataArraySlice SubSlice;
if (SrcOff < Slice.Length) {
SubSlice = Slice;
SubSlice.move(SrcOff);
} else {
// This is an out-of-bounds access and hence UB. Pretend we read zero.
SubSlice.Array = nullptr;
SubSlice.Offset = 0;
SubSlice.Length = VTSize;
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
MMOFlags);
}
if (!Store.getNode()) {
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
// to Load/Store if NVT==VT.
// FIXME does the case above also need this?
EVT NVT = TLI.getTypeToTransformTo(C, VT);
assert(NVT.bitsGE(VT));
bool isDereferenceable =
SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
OutChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
}
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
// Turn a memmove of undef to nop.
if (Src.isUndef())
return Chain;
// Expand memmove to a series of load and store ops if the size operand falls
// below a certain threshold.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const DataLayout &DL = DAG.getDataLayout();
LLVMContext &C = *DAG.getContext();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
if (Align > SrcAlign)
SrcAlign = Align;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align), SrcAlign,
false, false, false, false,
DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(),
DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(C);
unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
Align = NewAlign;
}
}
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
uint64_t SrcOff = 0, DstOff = 0;
SmallVector<SDValue, 8> LoadValues;
SmallVector<SDValue, 8> LoadChains;
SmallVector<SDValue, 8> OutChains;
unsigned NumMemOps = MemOps.size();
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value;
bool isDereferenceable =
SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
Value =
DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
SrcOff += VTSize;
}
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
OutChains.clear();
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Store;
Store = DAG.getStore(Chain, dl, LoadValues[i],
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
OutChains.push_back(Store);
DstOff += VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
/// \brief Lower a call to the 'memset' intrinsic into a series of store
/// operations.
///
/// \param DAG Selection DAG where lowered code is placed.
/// \param dl Link to corresponding IR location.
/// \param Chain Control flow dependency.
/// \param Dst Pointer to destination memory location.
/// \param Src Value of byte to write into the memory.
/// \param Size Number of bytes to write.
/// \param Align Alignment of the destination in bytes.
/// \param isVol True if destination is volatile.
/// \param DstPtrInfo IR information on the memory pointer.
/// \returns New head in the control flow, if lowering was successful, empty
/// SDValue otherwise.
///
/// The function tries to replace the 'llvm.memset' intrinsic with several
/// store operations and value-calculation code. This is usually profitable
/// for small memory sizes.
static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align, bool isVol,
MachinePointerInfo DstPtrInfo) {
// Turn a memset of undef to nop.
if (Src.isUndef())
return Chain;
// Expand memset to a series of load/store ops if the size operand
// falls below a certain threshold.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
bool IsZeroVal =
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
Size, (DstAlignCanChange ? 0 : Align), 0,
true, IsZeroVal, false, true,
DstPtrInfo.getAddrSpace(), ~0u,
DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
Align = NewAlign;
}
}
SmallVector<SDValue, 8> OutChains;
uint64_t DstOff = 0;
unsigned NumMemOps = MemOps.size();
// Find the largest store and generate the bit pattern for it.
EVT LargestVT = MemOps[0];
for (unsigned i = 1; i < NumMemOps; i++)
if (MemOps[i].bitsGT(LargestVT))
LargestVT = MemOps[i];
SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
assert(i == NumMemOps-1 && i != 0);
DstOff -= VTSize - Size;
}
// If this store is smaller than the largest store, see whether we can get
// the smaller value for free with a truncate.
SDValue Value = MemSetValue;
if (VT.bitsLT(LargestVT)) {
if (!LargestVT.isVector() && !VT.isVector() &&
TLI.isTruncateFree(LargestVT, VT))
Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
else
Value = getMemsetValue(Src, VT, DAG, dl);
}
assert(Value.getValueType() == VT && "Value with wrong type.");
SDValue Store = DAG.getStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
Size -= VTSize;
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
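// Illustrative sketch, not part of the original source: inlining a 16-byte
// zeroing store sequence through the helper above. The helper name is
// hypothetical and DAG, dl, Chain and Dst are assumed to come from a caller;
// an empty SDValue result means the expansion was rejected as unprofitable.
static LLVM_ATTRIBUTE_UNUSED SDValue
inlineSmallMemsetSketch(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                        SDValue Dst) {
  // The fill value is an i8 at this level, matching the assertion in
  // getMemsetValue above.
  SDValue Zero = DAG.getConstant(0, dl, MVT::i8);
  return getMemsetStores(DAG, dl, Chain, Dst, Zero, /*Size=*/16, /*Align=*/8,
                         /*isVol=*/false, MachinePointerInfo());
}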
static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
unsigned AS) {
// Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
// pointer operands can be losslessly bitcasted to pointers of address space 0
if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) {
report_fatal_error("cannot lower memory intrinsic in address space " +
Twine(AS));
}
}
SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, unsigned Align,
bool isVol, bool AlwaysInline, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
// Check to see if we should lower the memcpy to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memcpy with size zero? Just return the original chain.
if (ConstantSize->isNullValue())
return Chain;
SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
ConstantSize->getZExtValue(),Align,
isVol, false, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
// Then check to see if we should lower the memcpy with target-specific
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemcpy(
*this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
// If we really need inline code and the target declined to provide it,
// use a (potentially long) sequence of loads and stores.
if (AlwaysInline) {
assert(ConstantSize && "AlwaysInline requires a constant size!");
return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
ConstantSize->getZExtValue(), Align, isVol,
true, DstPtrInfo, SrcPtrInfo);
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
// FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
// memcpy is not guaranteed to be safe. libc memcpys aren't required to
// respect volatile, so they may do things like read or write memory
// beyond the given memory regions. But fixing this isn't easy, and most
// people don't care.
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = getDataLayout().getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
TLI->getPointerTy(getDataLayout())),
std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
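// Illustrative sketch, not part of the original source: how a caller lowering
// an @llvm.memcpy call might invoke the entry point above. The helper name is
// hypothetical, and Dst, Src and Size are assumed to be already-lowered
// operands.
static LLVM_ATTRIBUTE_UNUSED SDValue
lowerMemcpySketch(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                  SDValue Dst, SDValue Src, SDValue Size) {
  // Non-volatile copy with 1-byte alignment: the DAG either inlines it as
  // loads and stores, defers to target-specific code, or falls back to a
  // libc memcpy call.
  return DAG.getMemcpy(Chain, dl, Dst, Src, Size, /*Align=*/1,
                       /*isVol=*/false, /*AlwaysInline=*/false,
                       /*isTailCall=*/false, MachinePointerInfo(),
                       MachinePointerInfo());
}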
SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, unsigned Align,
bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
// Check to see if we should lower the memmove to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memmove with size zero? Just return the original chain.
if (ConstantSize->isNullValue())
return Chain;
SDValue Result =
getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
ConstantSize->getZExtValue(), Align, isVol,
false, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
// Then check to see if we should lower the memmove with target-specific
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemmove(
*this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
// FIXME: If the memmove is volatile, lowering it to plain libc memmove may
// not be safe. See memcpy above for more details.
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Ty = getDataLayout().getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
TLI->getPointerTy(getDataLayout())),
std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Src, SDValue Size, unsigned Align,
bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo) {
assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
// Check to see if we should lower the memset to stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (ConstantSize) {
// Memset with size zero? Just return the original chain.
if (ConstantSize->isNullValue())
return Chain;
SDValue Result =
getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
Align, isVol, DstPtrInfo);
if (Result.getNode())
return Result;
}
// Then check to see if we should lower the memset with target-specific
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemset(
*this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
if (Result.getNode())
return Result;
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
// Emit a library call.
Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst; Entry.Ty = IntPtrTy;
Args.push_back(Entry);
Entry.Node = Src;
Entry.Ty = Src.getValueType().getTypeForEVT(*getContext());
Args.push_back(Entry);
Entry.Node = Size;
Entry.Ty = IntPtrTy;
Args.push_back(Entry);
// FIXME: pass in SDLoc
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
Dst.getValueType().getTypeForEVT(*getContext()),
getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
TLI->getPointerTy(getDataLayout())),
std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
return CallResult.second;
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDVTList VTList, ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
FoldingSetNodeID ID;
ID.AddInteger(MemVT.getRawBits());
AddNodeIDNode(ID, Opcode, VTList, Ops);
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void* IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<AtomicSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
VTList, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getAtomicCmpSwap(
unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment, AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering, SyncScope::ID SSID) {
assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
AAMDNodes(), nullptr, SSID, SuccessOrdering,
FailureOrdering);
return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO);
}
SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
EVT MemVT, SDVTList VTs, SDValue Chain,
SDValue Ptr, SDValue Cmp, SDValue Swp,
MachineMemOperand *MMO) {
assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDValue Chain, SDValue Ptr, SDValue Val,
const Value *PtrVal, unsigned Alignment,
AtomicOrdering Ordering,
SyncScope::ID SSID) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
// An atomic store does not load. An atomic load does not store.
// (An atomicrmw obviously both loads and stores.)
// For now, atomics are always considered volatile and are chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
auto Flags = MachineMemOperand::MOVolatile;
if (Opcode != ISD::ATOMIC_STORE)
Flags |= MachineMemOperand::MOLoad;
if (Opcode != ISD::ATOMIC_LOAD)
Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
MemVT.getStoreSize(), Alignment, AAMDNodes(),
nullptr, SSID, Ordering);
return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDValue Chain, SDValue Ptr, SDValue Val,
MachineMemOperand *MMO) {
assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
Opcode == ISD::ATOMIC_LOAD_SUB ||
Opcode == ISD::ATOMIC_LOAD_AND ||
Opcode == ISD::ATOMIC_LOAD_OR ||
Opcode == ISD::ATOMIC_LOAD_XOR ||
Opcode == ISD::ATOMIC_LOAD_NAND ||
Opcode == ISD::ATOMIC_LOAD_MIN ||
Opcode == ISD::ATOMIC_LOAD_MAX ||
Opcode == ISD::ATOMIC_LOAD_UMIN ||
Opcode == ISD::ATOMIC_LOAD_UMAX ||
Opcode == ISD::ATOMIC_SWAP ||
Opcode == ISD::ATOMIC_STORE) &&
"Invalid Atomic Op");
EVT VT = Val.getValueType();
SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
getVTList(VT, MVT::Other);
SDValue Ops[] = {Chain, Ptr, Val};
return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
EVT VT, SDValue Chain, SDValue Ptr,
MachineMemOperand *MMO) {
assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = {Chain, Ptr};
return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
}
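// Illustrative sketch, not part of the original source: building a
// sequentially consistent 32-bit atomicrmw add with the PtrVal/Alignment
// overload above. The helper name is hypothetical; Chain, Ptr and Val are
// assumed to come from a caller, and no underlying IR pointer value is known
// here, so nullptr is passed for PtrVal.
static LLVM_ATTRIBUTE_UNUSED SDValue
buildAtomicAddSketch(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                     SDValue Ptr, SDValue Val) {
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, MVT::i32, Chain, Ptr, Val,
                       /*PtrVal=*/nullptr, /*Alignment=*/4,
                       AtomicOrdering::SequentiallyConsistent,
                       SyncScope::System);
}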
/// getMergeValues - Create a MERGE_VALUES node from the given operands.
SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
if (Ops.size() == 1)
return Ops[0];
SmallVector<EVT, 4> VTs;
VTs.reserve(Ops.size());
for (unsigned i = 0; i < Ops.size(); ++i)
VTs.push_back(Ops[i].getValueType());
return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops);
}
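// Illustrative sketch, not part of the original source: packaging a computed
// value together with its chain as the two results of a custom-lowered node.
// The helper name and operands are hypothetical.
static LLVM_ATTRIBUTE_UNUSED SDValue
mergeValueAndChainSketch(SelectionDAG &DAG, const SDLoc &dl, SDValue Val,
                         SDValue Chain) {
  // Result 0 of the MERGE_VALUES node is Val and result 1 is Chain; a
  // single-element list would simply be returned unchanged.
  SDValue Parts[] = { Val, Chain };
  return DAG.getMergeValues(Parts, dl);
}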
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol,
bool ReadMem, bool WriteMem, unsigned Size) {
if (Align == 0) // Ensure that codegen never sees alignment 0
Align = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
auto Flags = MachineMemOperand::MONone;
if (WriteMem)
Flags |= MachineMemOperand::MOStore;
if (ReadMem)
Flags |= MachineMemOperand::MOLoad;
if (Vol)
Flags |= MachineMemOperand::MOVolatile;
if (!Size)
Size = MemVT.getStoreSize();
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, Size, Align);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO) {
assert((Opcode == ISD::INTRINSIC_VOID ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::PREFETCH ||
Opcode == ISD::LIFETIME_START ||
Opcode == ISD::LIFETIME_END ||
((int)Opcode <= std::numeric_limits<int>::max() &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
// Memoize the node unless it returns a flag.
MemIntrinsicSDNode *N;
if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
VTList, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
VTList, MemVT, MMO);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
/// MachinePointerInfo record from it. This is particularly useful because the
/// code generator has many cases where it doesn't bother passing in a
/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr,
int64_t Offset = 0) {
// If this is FI+Offset, we can model it.
if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
FI->getIndex(), Offset);
// If this is (FI+Offset1)+Offset2, we can model it.
if (Ptr.getOpcode() != ISD::ADD ||
!isa<ConstantSDNode>(Ptr.getOperand(1)) ||
!isa<FrameIndexSDNode>(Ptr.getOperand(0)))
return MachinePointerInfo();
int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
return MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), FI,
Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
}
/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
/// MachinePointerInfo record from it. This is particularly useful because the
/// code generator has many cases where it doesn't bother passing in a
/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr,
SDValue OffsetOp) {
// If the 'Offset' value isn't a constant, we can't handle this.
if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue());
if (OffsetOp.isUndef())
return InferPointerInfo(DAG, Ptr);
return MachinePointerInfo();
}
SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Offset,
MachinePointerInfo PtrInfo, EVT MemVT,
unsigned Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo, const MDNode *Ranges) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MMOFlags |= MachineMemOperand::MOLoad;
assert((MMOFlags & MachineMemOperand::MOStore) == 0);
// If we don't have a PtrInfo, infer the trivial frame index case to simplify
// clients.
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr, Offset);
MachineFunction &MF = getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
}
SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Offset, EVT MemVT,
MachineMemOperand *MMO) {
if (VT == MemVT) {
ExtType = ISD::NON_EXTLOAD;
} else if (ExtType == ISD::NON_EXTLOAD) {
assert(VT == MemVT && "Non-extending load from different memory type!");
} else {
// Extending load.
assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
"Should only be an extending load, not truncating!");
assert(VT.isInteger() == MemVT.isInteger() &&
"Cannot convert from FP to Int or Int -> FP!");
assert(VT.isVector() == MemVT.isVector() &&
"Cannot use an ext load to convert to or from a vector!");
assert((!VT.isVector() ||
VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
"Cannot use an ext load to change the number of vector elements!");
}
bool Indexed = AM != ISD::UNINDEXED;
assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
SDVTList VTs = Indexed ?
getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Offset };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<LoadSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
ExtType, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, MachinePointerInfo PtrInfo,
unsigned Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo, const MDNode *Ranges) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
}
SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, MachineMemOperand *MMO) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
VT, MMO);
}
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
EVT VT, SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT MemVT,
unsigned Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo,
MemVT, Alignment, MMOFlags, AAInfo);
}
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
MachineMemOperand *MMO) {
SDValue Undef = getUNDEF(Ptr.getValueType());
return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
MemVT, MMO);
}
SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
SDValue Base, SDValue Offset,
ISD::MemIndexedMode AM) {
LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
// Don't propagate the invariant or dereferenceable flags.
auto MMOFlags =
LD->getMemOperand()->getFlags() &
~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
LD->getChain(), Base, Offset, LD->getPointerInfo(),
LD->getMemoryVT(), LD->getAlignment(), MMOFlags,
LD->getAAInfo());
}
SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
unsigned Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(Val.getValueType());
MMOFlags |= MachineMemOperand::MOStore;
assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr);
MachineFunction &MF = getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
return getStore(Chain, dl, Val, Ptr, MMO);
}
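// Illustrative sketch, not part of the original source: spilling a value to a
// stack slot and reloading it, letting getStore/getLoad infer the
// MachinePointerInfo from the frame-index pointer (see InferPointerInfo
// above). The helper name is hypothetical; StackSlotPtr is assumed to be a
// frame-index pointer produced by the caller, e.g. via CreateStackTemporary.
static LLVM_ATTRIBUTE_UNUSED SDValue
spillAndReloadSketch(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                     SDValue Val, SDValue StackSlotPtr, EVT VT) {
  // Passing an empty MachinePointerInfo triggers the FI (+ constant offset)
  // inference in the store and load constructors.
  SDValue Store =
      DAG.getStore(Chain, dl, Val, StackSlotPtr, MachinePointerInfo());
  return DAG.getLoad(VT, dl, Store, StackSlotPtr, MachinePointerInfo());
}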
SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachineMemOperand *MMO) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
SDVTList VTs = getVTList(MVT::Other);
SDValue Undef = getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
dl.getIROrder(), VTs, ISD::UNINDEXED, false, VT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
ISD::UNINDEXED, false, VT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
EVT SVT, unsigned Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(SVT);
MMOFlags |= MachineMemOperand::MOStore;
assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(*this, Ptr);
MachineFunction &MF = getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
}
SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, EVT SVT,
MachineMemOperand *MMO) {
EVT VT = Val.getValueType();
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (VT == SVT)
return getStore(Chain, dl, Val, Ptr, MMO);
assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
"Should only be a truncating store, not extending!");
assert(VT.isInteger() == SVT.isInteger() &&
"Can't do FP-INT conversion!");
assert(VT.isVector() == SVT.isVector() &&
"Cannot use trunc store to convert to or from a vector!");
assert((!VT.isVector() ||
VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
"Cannot use trunc store to change the number of vector elements!");
SDVTList VTs = getVTList(MVT::Other);
SDValue Undef = getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
ID.AddInteger(SVT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
ISD::UNINDEXED, true, SVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
SDValue Base, SDValue Offset,
ISD::MemIndexedMode AM) {
StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
ID.AddInteger(ST->getMemoryVT().getRawBits());
ID.AddInteger(ST->getRawSubclassData());
ID.AddInteger(ST->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
return SDValue(E, 0);
auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
ST->isTruncatingStore(), ST->getMemoryVT(),
ST->getMemOperand());
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Mask, SDValue Src0,
EVT MemVT, MachineMemOperand *MMO,
ISD::LoadExtType ExtTy, bool isExpanding) {
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
ExtTy, isExpanding, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
SDValue Val, SDValue Ptr, SDValue Mask,
EVT MemVT, MachineMemOperand *MMO,
bool IsTruncating, bool IsCompressing) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
SDVTList VTs = getVTList(MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Val };
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
IsTruncating, IsCompressing, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
assert(Ops.size() == 5 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
dl.getIROrder(), VTs, VT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO);
createOperands(N, Ops);
assert(N->getValue().getValueType() == N->getValueType(0) &&
"Incompatible type of the PassThru value in MaskedGatherSDNode");
assert(N->getMask().getValueType().getVectorNumElements() ==
N->getValueType(0).getVectorNumElements() &&
"Vector width mismatch between mask and data");
assert(N->getIndex().getValueType().getVectorNumElements() ==
N->getValueType(0).getVectorNumElements() &&
"Vector width mismatch between index and data");
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO) {
assert(Ops.size() == 5 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
dl.getIROrder(), VTs, VT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedScatterSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO);
createOperands(N, Ops);
assert(N->getMask().getValueType().getVectorNumElements() ==
N->getValue().getValueType().getVectorNumElements() &&
"Vector width mismatch between mask and data");
assert(N->getIndex().getValueType().getVectorNumElements() ==
N->getValue().getValueType().getVectorNumElements() &&
"Vector width mismatch between index and data");
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue SV, unsigned Align) {
SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDUse> Ops) {
switch (Ops.size()) {
case 0: return getNode(Opcode, DL, VT);
case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
}
// Copy from an SDUse array into an SDValue array for use with
// the regular getNode logic.
SmallVector<SDValue, 8> NewOps(Ops.begin(), Ops.end());
return getNode(Opcode, DL, VT, NewOps);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
}
switch (Opcode) {
default: break;
case ISD::CONCAT_VECTORS:
// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
case ISD::SELECT_CC:
assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
assert(Ops[0].getValueType() == Ops[1].getValueType() &&
"LHS and RHS of condition must have same type!");
assert(Ops[2].getValueType() == Ops[3].getValueType() &&
"True and False arms of SelectCC must have same type!");
assert(Ops[2].getValueType() == VT &&
"select_cc node must be of same type as true and false value!");
break;
case ISD::BR_CC:
assert(NumOps == 5 && "BR_CC takes 5 operands!");
assert(Ops[2].getValueType() == Ops[3].getValueType() &&
"LHS/RHS of comparison should match types!");
break;
}
// Memoize nodes.
SDNode *N;
SDVTList VTs = getVTList(VT);
if (VT != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
return getNode(Opcode, DL, getVTList(ResultTys), Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops) {
if (VTList.NumVTs == 1)
return getNode(Opcode, DL, VTList.VTs[0], Ops);
#if 0
switch (Opcode) {
// FIXME: figure out how to safely handle things like
// int foo(int x) { return 1 << (x & 255); }
// int bar() { return foo(256); }
case ISD::SRA_PARTS:
case ISD::SRL_PARTS:
case ISD::SHL_PARTS:
if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
else if (N3.getOpcode() == ISD::AND)
if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
// If the and is only masking out bits that cannot affect the shift,
// eliminate the and.
unsigned NumBits = VT.getScalarSizeInBits()*2;
if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
}
break;
}
#endif
// Memoize the node unless it returns a flag.
SDNode *N;
if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
return SDValue(E, 0);
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
createOperands(N, Ops);
}
InsertNode(N);
return SDValue(N, 0);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
SDVTList VTList) {
return getNode(Opcode, DL, VTList, None);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1) {
SDValue Ops[] = { N1 };
return getNode(Opcode, DL, VTList, Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2) {
SDValue Ops[] = { N1, N2 };
return getNode(Opcode, DL, VTList, Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2, SDValue N3) {
SDValue Ops[] = { N1, N2, N3 };
return getNode(Opcode, DL, VTList, Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
SDValue Ops[] = { N1, N2, N3, N4 };
return getNode(Opcode, DL, VTList, Ops);
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
SDValue N1, SDValue N2, SDValue N3, SDValue N4,
SDValue N5) {
SDValue Ops[] = { N1, N2, N3, N4, N5 };
return getNode(Opcode, DL, VTList, Ops);
}
SDVTList SelectionDAG::getVTList(EVT VT) {
return makeVTList(SDNode::getValueTypeList(VT), 1);
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
FoldingSetNodeID ID;
ID.AddInteger(2U);
ID.AddInteger(VT1.getRawBits());
ID.AddInteger(VT2.getRawBits());
void *IP = nullptr;
SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
if (!Result) {
EVT *Array = Allocator.Allocate<EVT>(2);
Array[0] = VT1;
Array[1] = VT2;
Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
VTListMap.InsertNode(Result, IP);
}
return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
FoldingSetNodeID ID;
ID.AddInteger(3U);
ID.AddInteger(VT1.getRawBits());
ID.AddInteger(VT2.getRawBits());
ID.AddInteger(VT3.getRawBits());
void *IP = nullptr;
SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
if (!Result) {
EVT *Array = Allocator.Allocate<EVT>(3);
Array[0] = VT1;
Array[1] = VT2;
Array[2] = VT3;
Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
VTListMap.InsertNode(Result, IP);
}
return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
FoldingSetNodeID ID;
ID.AddInteger(4U);
ID.AddInteger(VT1.getRawBits());
ID.AddInteger(VT2.getRawBits());
ID.AddInteger(VT3.getRawBits());
ID.AddInteger(VT4.getRawBits());
void *IP = nullptr;
SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
if (!Result) {
EVT *Array = Allocator.Allocate<EVT>(4);
Array[0] = VT1;
Array[1] = VT2;
Array[2] = VT3;
Array[3] = VT4;
Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
VTListMap.InsertNode(Result, IP);
}
return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
unsigned NumVTs = VTs.size();
FoldingSetNodeID ID;
ID.AddInteger(NumVTs);
for (unsigned index = 0; index < NumVTs; index++) {
ID.AddInteger(VTs[index].getRawBits());
}
void *IP = nullptr;
SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
if (!Result) {
EVT *Array = Allocator.Allocate<EVT>(NumVTs);
std::copy(VTs.begin(), VTs.end(), Array);
Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
VTListMap.InsertNode(Result, IP);
}
return Result->getSDVTList();
}
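// Illustrative sketch (not part of the original source): the getVTList
// overloads above intern every distinct value-type list in VTListMap, so
// requesting the same combination twice hands back the same backing array.
// The helper name and the assertion below are examples only.
#if 0
static void vtListInterningExample(SelectionDAG &DAG) {
  SDVTList A = DAG.getVTList(MVT::i32, MVT::Other);
  SDVTList B = DAG.getVTList(MVT::i32, MVT::Other);
  // Both lookups resolve to the same SDVTListNode, so the pointers match.
  assert(A.VTs == B.VTs && A.NumVTs == 2 && "expected an interned VT list");
}
#endif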
/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
/// specified operands. If the resultant node already exists in the DAG,
/// this does not modify the specified node; instead it returns the node that
/// already exists. If the resultant node does not exist in the DAG, the
/// input node is returned. As a degenerate case, if you specify the same
/// input operands as the node already has, the input node is returned.
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
// Check to see if there is no change.
if (Op == N->getOperand(0)) return N;
// See if the modified node already exists.
void *InsertPos = nullptr;
if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
return Existing;
// Nope it doesn't. Remove the node from its current place in the maps.
if (InsertPos)
if (!RemoveNodeFromCSEMaps(N))
InsertPos = nullptr;
// Now we update the operands.
N->OperandList[0].set(Op);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
}
SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
// Check to see if there is no change.
if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
return N; // No operands changed, just return the input node.
// See if the modified node already exists.
void *InsertPos = nullptr;
if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
return Existing;
// Nope it doesn't. Remove the node from its current place in the maps.
if (InsertPos)
if (!RemoveNodeFromCSEMaps(N))
InsertPos = nullptr;
// Now we update the operands.
if (N->OperandList[0] != Op1)
N->OperandList[0].set(Op1);
if (N->OperandList[1] != Op2)
N->OperandList[1].set(Op2);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
}
SDNode *SelectionDAG::
UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
SDValue Ops[] = { Op1, Op2, Op3 };
return UpdateNodeOperands(N, Ops);
}
SDNode *SelectionDAG::
UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4) {
SDValue Ops[] = { Op1, Op2, Op3, Op4 };
return UpdateNodeOperands(N, Ops);
}
SDNode *SelectionDAG::
UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
SDValue Op3, SDValue Op4, SDValue Op5) {
SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
return UpdateNodeOperands(N, Ops);
}
SDNode *SelectionDAG::
UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
unsigned NumOps = Ops.size();
assert(N->getNumOperands() == NumOps &&
"Update with wrong number of operands");
// If no operands changed just return the input node.
if (std::equal(Ops.begin(), Ops.end(), N->op_begin()))
return N;
// See if the modified node already exists.
void *InsertPos = nullptr;
if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos))
return Existing;
// Nope it doesn't. Remove the node from its current place in the maps.
if (InsertPos)
if (!RemoveNodeFromCSEMaps(N))
InsertPos = nullptr;
// Now we update the operands.
for (unsigned i = 0; i != NumOps; ++i)
if (N->OperandList[i] != Ops[i])
N->OperandList[i].set(Ops[i]);
// If this gets put into a CSE map, add it.
if (InsertPos) CSEMap.InsertNode(N, InsertPos);
return N;
}
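// Illustrative sketch (not part of the original source): UpdateNodeOperands
// mutates a node in place, but if the rewritten node collides with an
// existing entry in the CSE map the pre-existing node is returned instead,
// so callers must always use the return value. Names are hypothetical.
#if 0
static SDNode *retargetAddRHSExample(SelectionDAG &DAG, SDNode *Add,
                                     SDValue NewRHS) {
  // Swap only the second operand of an existing two-operand node.
  SDNode *Result = DAG.UpdateNodeOperands(Add, Add->getOperand(0), NewRHS);
  // Result is either Add itself (mutated) or an equivalent existing node.
  return Result;
}
#endif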
/// DropOperands - Release the operands and set this node to have
/// zero operands.
void SDNode::DropOperands() {
// Unlike the code in MorphNodeTo that does this, we don't need to
// watch for dead nodes here.
for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
SDUse &Use = *I++;
Use.set(SDValue());
}
}
/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
/// machine opcode.
///
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT) {
SDVTList VTs = getVTList(VT);
return SelectNodeTo(N, MachineOpc, VTs, None);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT, SDValue Op1) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1 };
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT, SDValue Op1,
SDValue Op2) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2 };
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT, SDValue Op1,
SDValue Op2, SDValue Op3) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2, Op3 };
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT, ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT);
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2);
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT1, EVT VT2) {
SDVTList VTs = getVTList(VT1, VT2);
return SelectNodeTo(N, MachineOpc, VTs, None);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT1, EVT VT2, EVT VT3,
ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
EVT VT1, EVT VT2,
SDValue Op1, SDValue Op2) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1, Op2 };
return SelectNodeTo(N, MachineOpc, VTs, Ops);
}
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
SDVTList VTs, ArrayRef<SDValue> Ops) {
SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
// Reset the NodeID to -1.
New->setNodeId(-1);
if (New != N) {
ReplaceAllUsesWith(N, New);
RemoveDeadNode(N);
}
return New;
}
/// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away
/// the line number information on the merged node since it is not possible to
/// preserve the information that the operation is associated with multiple lines.
/// This will make the debugger work better at -O0, where there is a higher
/// probability of having other instructions associated with that line.
///
/// For IROrder, we keep the smaller of the two
SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
DebugLoc NLoc = N->getDebugLoc();
if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
N->setDebugLoc(DebugLoc());
}
unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
N->setIROrder(Order);
return N;
}
/// MorphNodeTo - This *mutates* the specified node to have the specified
/// return type, opcode, and operands.
///
/// Note that MorphNodeTo returns the resultant node. If there is already a
/// node of the specified opcode and operands, it returns that node instead of
/// the current one. Note that the SDLoc need not be the same.
///
/// Using MorphNodeTo is faster than creating a new node and swapping it in
/// with ReplaceAllUsesWith both because it often avoids allocating a new
/// node, and because it doesn't require CSE recalculation for any of
/// the node's users.
///
/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
/// As a consequence it isn't appropriate to use from within the DAG combiner or
/// the legalizer which maintain worklists that would need to be updated when
/// deleting things.
SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
SDVTList VTs, ArrayRef<SDValue> Ops) {
// If an identical node already exists, use it.
void *IP = nullptr;
if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, VTs, Ops);
if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
}
if (!RemoveNodeFromCSEMaps(N))
IP = nullptr;
// Start the morphing.
N->NodeType = Opc;
N->ValueList = VTs.VTs;
N->NumValues = VTs.NumVTs;
// Clear the operands list, updating used nodes to remove this from their
// use list. Keep track of any operands that become dead as a result.
SmallPtrSet<SDNode*, 16> DeadNodeSet;
for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
SDUse &Use = *I++;
SDNode *Used = Use.getNode();
Use.set(SDValue());
if (Used->use_empty())
DeadNodeSet.insert(Used);
}
// For MachineNode, initialize the memory references information.
if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
MN->setMemRefs(nullptr, nullptr);
// Swap for an appropriately sized array from the recycler.
removeOperands(N);
createOperands(N, Ops);
// Delete any nodes that are still dead after adding the uses for the
// new operands.
if (!DeadNodeSet.empty()) {
SmallVector<SDNode *, 16> DeadNodes;
for (SDNode *N : DeadNodeSet)
if (N->use_empty())
DeadNodes.push_back(N);
RemoveDeadNodes(DeadNodes);
}
if (IP)
CSEMap.InsertNode(N, IP); // Memoize the new node.
return N;
}
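// Illustrative sketch (not part of the original source): the contract above
// means a caller that morphs a node must check whether MorphNodeTo returned a
// different (pre-existing) node and, if so, redirect the users itself, just
// as SelectNodeTo above and mutateStrictFPToFP below do. Names are
// hypothetical.
#if 0
static SDNode *morphBinaryNodeExample(SelectionDAG &DAG, SDNode *N,
                                      unsigned NewOpc) {
  SDVTList VTs = DAG.getVTList(N->getValueType(0));
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
  SDNode *Res = DAG.MorphNodeTo(N, NewOpc, VTs, Ops);
  if (Res != N) {
    // An equivalent node already existed; fold N into it.
    DAG.ReplaceAllUsesWith(N, Res);
    DAG.RemoveDeadNode(N);
  }
  return Res;
}
#endif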
SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
unsigned OrigOpc = Node->getOpcode();
unsigned NewOpc;
bool IsUnary = false;
switch (OrigOpc) {
default:
llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
case ISD::STRICT_FADD: NewOpc = ISD::FADD; break;
case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break;
case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break;
case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break;
case ISD::STRICT_FREM: NewOpc = ISD::FREM; break;
case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break;
case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break;
case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break;
case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break;
case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break;
case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break;
case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break;
case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break;
case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break;
case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break;
case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break;
case ISD::STRICT_FNEARBYINT:
NewOpc = ISD::FNEARBYINT;
IsUnary = true;
break;
}
// We're taking this node out of the chain, so we need to re-link things.
SDValue InputChain = Node->getOperand(0);
SDValue OutputChain = SDValue(Node, 1);
ReplaceAllUsesOfValueWith(OutputChain, InputChain);
SDVTList VTs = getVTList(Node->getOperand(1).getValueType());
SDNode *Res = nullptr;
if (IsUnary)
Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) });
else
Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
Node->getOperand(2) });
// MorphNodeTo can operate in two ways: if an existing node with the
// specified operands exists, it can just return it. Otherwise, it
// updates the node in place to have the requested operands.
if (Res == Node) {
// If we updated the node in place, reset the node ID. To the isel,
// this should be just like a newly allocated machine node.
Res->setNodeId(-1);
} else {
ReplaceAllUsesWith(Node, Res);
RemoveDeadNode(Node);
}
return Res;
}
/// getMachineNode - These are used for target selectors to create a new node
/// with specified return type(s), MachineInstr opcode, and operands.
///
/// Note that getMachineNode returns the resultant node. If there is already a
/// node of the specified opcode and operands, it returns that node instead of
/// the current one.
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT) {
SDVTList VTs = getVTList(VT);
return getMachineNode(Opcode, dl, VTs, None);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT, SDValue Op1) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT, SDValue Op1, SDValue Op2) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT, SDValue Op1, SDValue Op2,
SDValue Op3) {
SDVTList VTs = getVTList(VT);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT, ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT);
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2, SDValue Op1,
SDValue Op2) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2, SDValue Op1,
SDValue Op2, SDValue Op3) {
SDVTList VTs = getVTList(VT1, VT2);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2,
ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2);
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2, EVT VT3,
SDValue Op1, SDValue Op2) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
SDValue Ops[] = { Op1, Op2 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2, EVT VT3,
SDValue Op1, SDValue Op2,
SDValue Op3) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
SDValue Ops[] = { Op1, Op2, Op3 };
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
EVT VT1, EVT VT2, EVT VT3,
ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(VT1, VT2, VT3);
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
ArrayRef<EVT> ResultTys,
ArrayRef<SDValue> Ops) {
SDVTList VTs = getVTList(ResultTys);
return getMachineNode(Opcode, dl, VTs, Ops);
}
MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
SDVTList VTs,
ArrayRef<SDValue> Ops) {
bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
MachineSDNode *N;
void *IP = nullptr;
if (DoCSE) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ~Opcode, VTs, Ops);
IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(E, DL));
}
}
// Allocate a new MachineSDNode.
N = newSDNode<MachineSDNode>(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
createOperands(N, Ops);
if (DoCSE)
CSEMap.InsertNode(N, IP);
InsertNode(N);
return N;
}
/// getTargetExtractSubreg - A convenience function for creating
/// TargetOpcode::EXTRACT_SUBREG nodes.
SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand) {
SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
VT, Operand, SRIdxVal);
return SDValue(Subreg, 0);
}
/// getTargetInsertSubreg - A convenience function for creating
/// TargetOpcode::INSERT_SUBREG nodes.
SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
SDValue Operand, SDValue Subreg) {
SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
VT, Operand, Subreg, SRIdxVal);
return SDValue(Result, 0);
}
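// Illustrative sketch (not part of the original source): a target selector
// can combine the two helpers above to rewrite one lane of a wider register.
// The sub-register index value 1 is a placeholder; real indices come from the
// target's generated register information.
#if 0
static SDValue replaceLowHalfExample(SelectionDAG &DAG, const SDLoc &DL,
                                     SDValue Wide64, SDValue NewLow32) {
  // Pull out the current 32-bit sub-register, then insert the replacement.
  SDValue OldLow = DAG.getTargetExtractSubreg(/*SRIdx=*/1, DL, MVT::i32, Wide64);
  (void)OldLow;
  return DAG.getTargetInsertSubreg(/*SRIdx=*/1, DL, MVT::i64, Wide64, NewLow32);
}
#endif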
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
const SDNodeFlags Flags) {
if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
E->intersectFlagsWith(Flags);
return E;
}
}
return nullptr;
}
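// Illustrative sketch (not part of the original source): getNodeIfExists lets
// a combine probe the CSE map without creating anything, for example to check
// whether the commuted form of an ADD has already been built. Names are
// hypothetical.
#if 0
static SDNode *findCommutedAddExample(SelectionDAG &DAG, SDValue LHS,
                                      SDValue RHS, EVT VT) {
  SDValue Ops[] = { RHS, LHS };
  return DAG.getNodeIfExists(ISD::ADD, DAG.getVTList(VT), Ops, SDNodeFlags());
}
#endif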
/// getDbgValue - Creates an SDDbgValue node.
///
/// SDNode
SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N,
unsigned R, bool IsIndirect, uint64_t Off,
const DebugLoc &DL, unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc())
SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O);
}
/// Constant
SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr,
const Value *C, uint64_t Off,
const DebugLoc &DL, unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, Off, DL, O);
}
/// FrameIndex
SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr,
unsigned FI, uint64_t Off,
const DebugLoc &DL,
unsigned O) {
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, Off, DL, O);
}
namespace {
/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
/// pointed to by a use iterator is deleted, increment the use iterator
/// so that it doesn't dangle.
///
class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
SDNode::use_iterator &UI;
SDNode::use_iterator &UE;
void NodeDeleted(SDNode *N, SDNode *E) override {
// Increment the iterator as needed.
while (UI != UE && N == *UI)
++UI;
}
public:
RAUWUpdateListener(SelectionDAG &d,
SDNode::use_iterator &ui,
SDNode::use_iterator &ue)
: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
};
} // end anonymous namespace
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version assumes From has a single result value.
///
void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
SDNode *From = FromN.getNode();
assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
"Cannot replace with this method!");
assert(From != To.getNode() && "Cannot replace uses of with self");
// Preserve Debug Values
TransferDbgValues(FromN, To);
// Iterate over all the existing uses of From. New uses will be added
// to the beginning of the use list, which we avoid visiting.
// This specifically avoids visiting uses of From that arise while the
// replacement is happening, because any such uses would be the result
// of CSE: If an existing node looks like From after one of its operands
// is replaced by To, we don't want to replace all of its users with To
// too. See PR3018 for more info.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
++UI;
Use.set(To);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (FromN == getRoot())
setRoot(To);
}
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version assumes that for each value of From, there is a
/// corresponding value in To in the same position with the same type.
///
void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
#ifndef NDEBUG
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
assert((!From->hasAnyUseOfValue(i) ||
From->getValueType(i) == To->getValueType(i)) &&
"Cannot use this version of ReplaceAllUsesWith!");
#endif
// Handle the trivial case.
if (From == To)
return;
// Preserve Debug Info. Only do this if there's a use.
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
if (From->hasAnyUseOfValue(i)) {
assert((i < To->getNumValues()) && "Invalid To location");
TransferDbgValues(SDValue(From, i), SDValue(To, i));
}
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
++UI;
Use.setNode(To);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot().getNode())
setRoot(SDValue(To, getRoot().getResNo()));
}
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
///
/// This version can replace From with any result values. To must match the
/// number and types of values returned by From.
void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
if (From->getNumValues() == 1) // Handle the simple case efficiently.
return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
// Preserve Debug Info.
for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
TransferDbgValues(SDValue(From, i), *To);
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
const SDValue &ToOp = To[Use.getResNo()];
++UI;
Use.set(ToOp);
} while (UI != UE && *UI == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot().getNode())
setRoot(SDValue(To[getRoot().getResNo()]));
}
/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone.
void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
// Handle the really simple, really trivial case efficiently.
if (From == To) return;
// Handle the simple, trivial case efficiently.
if (From.getNode()->getNumValues() == 1) {
ReplaceAllUsesWith(From, To);
return;
}
// Preserve Debug Info.
TransferDbgValues(From, To);
// Iterate over just the existing users of From. See the comments in
// the ReplaceAllUsesWith above.
SDNode::use_iterator UI = From.getNode()->use_begin(),
UE = From.getNode()->use_end();
RAUWUpdateListener Listener(*this, UI, UE);
while (UI != UE) {
SDNode *User = *UI;
bool UserRemovedFromCSEMaps = false;
// A user can appear in a use list multiple times, and when this
// happens the uses are usually next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
SDUse &Use = UI.getUse();
// Skip uses of different values from the same node.
if (Use.getResNo() != From.getResNo()) {
++UI;
continue;
}
// If this node hasn't been modified yet, it's still in the CSE maps,
// so remove its old self from the CSE maps.
if (!UserRemovedFromCSEMaps) {
RemoveNodeFromCSEMaps(User);
UserRemovedFromCSEMaps = true;
}
++UI;
Use.set(To);
} while (UI != UE && *UI == User);
// We are iterating over all uses of the From node, so if a use
// doesn't use the specific value, no changes are made.
if (!UserRemovedFromCSEMaps)
continue;
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
// If we just RAUW'd the root, take note.
if (From == getRoot())
setRoot(To);
}
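// Illustrative sketch (not part of the original source): contrast with the
// whole-node ReplaceAllUsesWith overloads above. For a two-result load
// (value, chain), only the users of the chain result are rewired here while
// the loaded value keeps its users. Names are hypothetical.
#if 0
static void rechainLoadExample(SelectionDAG &DAG, SDNode *Load,
                               SDValue NewChain) {
  // Redirect only uses of result #1 (the output chain) of the load.
  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewChain);
}
#endif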
namespace {
/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
/// to record information about a use.
struct UseMemo {
SDNode *User;
unsigned Index;
SDUse *Use;
};
/// operator< - Sort Memos by User.
bool operator<(const UseMemo &L, const UseMemo &R) {
return (intptr_t)L.User < (intptr_t)R.User;
}
} // end anonymous namespace
/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The same value
/// may appear in both the From and To list.
void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
const SDValue *To,
unsigned Num){
// Handle the simple, trivial case efficiently.
if (Num == 1)
return ReplaceAllUsesOfValueWith(*From, *To);
TransferDbgValues(*From, *To);
// Read up all the uses and make records of them. This helps
// processing new uses that are introduced during the
// replacement process.
SmallVector<UseMemo, 4> Uses;
for (unsigned i = 0; i != Num; ++i) {
unsigned FromResNo = From[i].getResNo();
SDNode *FromNode = From[i].getNode();
for (SDNode::use_iterator UI = FromNode->use_begin(),
E = FromNode->use_end(); UI != E; ++UI) {
SDUse &Use = UI.getUse();
if (Use.getResNo() == FromResNo) {
UseMemo Memo = { *UI, i, &Use };
Uses.push_back(Memo);
}
}
}
// Sort the uses, so that all the uses from a given User are together.
std::sort(Uses.begin(), Uses.end());
for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
UseIndex != UseIndexEnd; ) {
// We know that this user uses some value of From. If it is the right
// value, update it.
SDNode *User = Uses[UseIndex].User;
// This node is about to morph, remove its old self from the CSE maps.
RemoveNodeFromCSEMaps(User);
// The Uses array is sorted, so all the uses for a given User
// are next to each other in the list.
// To help reduce the number of CSE recomputations, process all
// the uses of this user that we can find this way.
do {
unsigned i = Uses[UseIndex].Index;
SDUse &Use = *Uses[UseIndex].Use;
++UseIndex;
Use.set(To[i]);
} while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
// Now that we have modified User, add it back to the CSE maps. If it
// already exists there, recursively merge the results together.
AddModifiedNodeToCSEMaps(User);
}
}
/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
/// based on their topological order. It returns the number of nodes assigned,
/// i.e. one past the largest id, with the ids matching the nodes' positions
/// in the now-sorted AllNodes list.
unsigned SelectionDAG::AssignTopologicalOrder() {
unsigned DAGSize = 0;
// SortedPos tracks the progress of the algorithm. Nodes before it are
// sorted, nodes after it are unsorted. When the algorithm completes
// it is at the end of the list.
allnodes_iterator SortedPos = allnodes_begin();
// Visit all the nodes. Move nodes with no operands to the front of
// the list immediately. Annotate nodes that do have operands with their
// operand count. Before we do this, the Node Id fields of the nodes
// may contain arbitrary values. After, the Node Id fields for nodes
// before SortedPos will contain the topological sort index, and the
// Node Id fields for nodes at SortedPos and after will contain the
// count of outstanding operands.
for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ) {
SDNode *N = &*I++;
checkForCycles(N, this);
unsigned Degree = N->getNumOperands();
if (Degree == 0) {
// A node with no operands; add it to the sorted list immediately.
N->setNodeId(DAGSize++);
allnodes_iterator Q(N);
if (Q != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
} else {
// Temporarily use the Node Id as scratch space for the degree count.
N->setNodeId(Degree);
}
}
// Visit all the nodes. As we iterate, move nodes into sorted order,
// such that by the time the end is reached all nodes will be sorted.
for (SDNode &Node : allnodes()) {
SDNode *N = &Node;
checkForCycles(N, this);
// N is in sorted position, so all its uses have one less operand
// that needs to be sorted.
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDNode *P = *UI;
unsigned Degree = P->getNodeId();
assert(Degree != 0 && "Invalid node degree");
--Degree;
if (Degree == 0) {
// All of P's operands are sorted, so P may be sorted now.
P->setNodeId(DAGSize++);
if (P->getIterator() != SortedPos)
SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
assert(SortedPos != AllNodes.end() && "Overran node list");
++SortedPos;
} else {
// Update P's outstanding operand count.
P->setNodeId(Degree);
}
}
if (Node.getIterator() == SortedPos) {
#ifndef NDEBUG
allnodes_iterator I(N);
SDNode *S = &*++I;
dbgs() << "Overran sorted position:\n";
S->dumprFull(this); dbgs() << "\n";
dbgs() << "Checking if this is due to cycles\n";
checkForCycles(this, true);
#endif
llvm_unreachable(nullptr);
}
}
assert(SortedPos == AllNodes.end() &&
"Topological sort incomplete!");
assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
"First node in topological sort is not the entry token!");
assert(AllNodes.front().getNodeId() == 0 &&
"First node in topological sort has non-zero id!");
assert(AllNodes.front().getNumOperands() == 0 &&
"First node in topological sort has operands!");
assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
"Last node in topologic sort has unexpected id!");
assert(AllNodes.back().use_empty() &&
"Last node in topologic sort has users!");
assert(DAGSize == allnodes_size() && "Node count mismatch!");
return DAGSize;
}
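// Illustrative sketch (not part of the original source): the routine above is
// a Kahn-style topological sort specialised to the DAG's intrusive node list,
// reusing the Node Id field as the outstanding-operand counter. The same idea
// on a plain adjacency list, as a standalone example:
#if 0
#include <queue>
#include <vector>
static std::vector<int>
kahnSortExample(const std::vector<std::vector<int>> &Users,
                std::vector<int> OperandCount) {
  std::vector<int> Order;
  std::queue<int> Ready;
  for (int N = 0, E = (int)OperandCount.size(); N != E; ++N)
    if (OperandCount[N] == 0)        // no operands: ready immediately
      Ready.push(N);
  while (!Ready.empty()) {
    int N = Ready.front();
    Ready.pop();
    Order.push_back(N);              // assign the next topological id
    for (int U : Users[N])
      if (--OperandCount[U] == 0)    // last operand now sorted: U is ready
        Ready.push(U);
  }
  return Order;
}
#endif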
/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
/// value is produced by SD.
void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
if (SD) {
assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
SD->setHasDebugValue(true);
}
DbgInfo->add(DB, SD, isParameter);
}
/// TransferDbgValues - Transfer SDDbgValues. Called in replace nodes.
void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
if (From == To || !From.getNode()->getHasDebugValue())
return;
SDNode *FromNode = From.getNode();
SDNode *ToNode = To.getNode();
ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode);
SmallVector<SDDbgValue *, 2> ClonedDVs;
for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end();
I != E; ++I) {
SDDbgValue *Dbg = *I;
// Only add Dbgvalues attached to the same ResNo.
if (Dbg->getKind() == SDDbgValue::SDNODE &&
Dbg->getSDNode() == From.getNode() &&
Dbg->getResNo() == From.getResNo() && !Dbg->isInvalidated()) {
assert(FromNode != ToNode &&
"Should not transfer Debug Values intranode");
SDDbgValue *Clone =
getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode,
To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(),
Dbg->getDebugLoc(), Dbg->getOrder());
ClonedDVs.push_back(Clone);
Dbg->setIsInvalidated();
}
}
for (SDDbgValue *I : ClonedDVs)
AddDbgValue(I, ToNode, false);
}
-void SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
- SDValue NewMemOp) {
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+ SDValue NewMemOp) {
assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
- if (!OldLoad->hasAnyUseOfValue(1))
- return;
-
// The new memory operation must have the same position as the old load in
// terms of memory dependency. Create a TokenFactor for the old load and new
// memory operation and update uses of the old load's output chain to use that
// TokenFactor.
SDValue OldChain = SDValue(OldLoad, 1);
SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
+ if (!OldLoad->hasAnyUseOfValue(1))
+ return NewChain;
+
SDValue TokenFactor =
getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+ return TokenFactor;
}
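// Illustrative sketch (not part of the original source): with the return
// value added above, a combine that replaces a load with another memory
// operation can forward the resulting chain directly instead of re-deriving
// it. Names are hypothetical.
#if 0
static void replaceLoadWithMemOpExample(SelectionDAG &DAG, LoadSDNode *OldLoad,
                                        SDValue NewMemOp) {
  // Keep the old load's chain users ordered after the new memory operation.
  SDValue Chain = DAG.makeEquivalentMemoryOrdering(OldLoad, NewMemOp);
  (void)Chain;
  // Redirect users of the loaded value, then let the old load die.
  DAG.ReplaceAllUsesOfValueWith(SDValue(OldLoad, 0), NewMemOp);
}
#endif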
//===----------------------------------------------------------------------===//
// SDNode Class
//===----------------------------------------------------------------------===//
bool llvm::isNullConstant(SDValue V) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
return Const != nullptr && Const->isNullValue();
}
bool llvm::isNullFPConstant(SDValue V) {
ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V);
return Const != nullptr && Const->isZero() && !Const->isNegative();
}
bool llvm::isAllOnesConstant(SDValue V) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
return Const != nullptr && Const->isAllOnesValue();
}
bool llvm::isOneConstant(SDValue V) {
ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
return Const != nullptr && Const->isOne();
}
bool llvm::isBitwiseNot(SDValue V) {
return V.getOpcode() == ISD::XOR && isAllOnesConstant(V.getOperand(1));
}
ConstantSDNode *llvm::isConstOrConstSplat(SDValue N) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
return CN;
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
BitVector UndefElements;
ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements);
// BuildVectors can truncate their operands. Ignore that case here.
// FIXME: We blindly ignore splats which include undef which is overly
// pessimistic.
if (CN && UndefElements.none() &&
CN->getValueType(0) == N.getValueType().getScalarType())
return CN;
}
return nullptr;
}
ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N) {
if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
return CN;
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
BitVector UndefElements;
ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements);
if (CN && UndefElements.none())
return CN;
}
return nullptr;
}
HandleSDNode::~HandleSDNode() {
DropOperands();
}
GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
const DebugLoc &DL,
const GlobalValue *GA, EVT VT,
int64_t o, unsigned char TF)
: SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
TheGlobal = GA;
}
AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
EVT VT, unsigned SrcAS,
unsigned DestAS)
: SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
: SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
MemSDNodeBits.IsVolatile = MMO->isVolatile();
MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
MemSDNodeBits.IsInvariant = MMO->isInvariant();
// We check here that the size of the memory operand fits within the size of
// the MMO. This is because the MMO might indicate only a possible address
// range instead of specifying the affected memory addresses precisely.
assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
}
/// Profile - Gather unique data for the node.
///
void SDNode::Profile(FoldingSetNodeID &ID) const {
AddNodeIDNode(ID, this);
}
namespace {
struct EVTArray {
std::vector<EVT> VTs;
EVTArray() {
VTs.reserve(MVT::LAST_VALUETYPE);
for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
VTs.push_back(MVT((MVT::SimpleValueType)i));
}
};
} // end anonymous namespace
static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
static ManagedStatic<EVTArray> SimpleVTArray;
static ManagedStatic<sys::SmartMutex<true>> VTMutex;
/// getValueTypeList - Return a pointer to the specified value type.
///
const EVT *SDNode::getValueTypeList(EVT VT) {
if (VT.isExtended()) {
sys::SmartScopedLock<true> Lock(*VTMutex);
return &(*EVTs->insert(VT).first);
} else {
assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
"Value type out of range!");
return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
}
}
/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
/// indicated value. This method ignores uses of other values defined by this
/// operation.
bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
assert(Value < getNumValues() && "Bad value!");
// TODO: Only iterate over uses of a given value of the node
for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
if (UI.getUse().getResNo() == Value) {
if (NUses == 0)
return false;
--NUses;
}
}
// Found exactly the right number of uses?
return NUses == 0;
}
/// hasAnyUseOfValue - Return true if there is any use of the indicated
/// value. This method ignores uses of other values defined by this operation.
bool SDNode::hasAnyUseOfValue(unsigned Value) const {
assert(Value < getNumValues() && "Bad value!");
for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
if (UI.getUse().getResNo() == Value)
return true;
return false;
}
/// isOnlyUserOf - Return true if this node is the only use of N.
bool SDNode::isOnlyUserOf(const SDNode *N) const {
bool Seen = false;
for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
SDNode *User = *I;
if (User == this)
Seen = true;
else
return false;
}
return Seen;
}
/// Return true if the only users of N are contained in Nodes.
bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
bool Seen = false;
for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
SDNode *User = *I;
if (llvm::any_of(Nodes,
[&User](const SDNode *Node) { return User == Node; }))
Seen = true;
else
return false;
}
return Seen;
}
/// isOperandOf - Return true if this node is an operand of N.
bool SDValue::isOperandOf(const SDNode *N) const {
for (const SDValue &Op : N->op_values())
if (*this == Op)
return true;
return false;
}
bool SDNode::isOperandOf(const SDNode *N) const {
for (const SDValue &Op : N->op_values())
if (this == Op.getNode())
return true;
return false;
}
/// reachesChainWithoutSideEffects - Return true if this operand (which must
/// be a chain) reaches the specified operand without crossing any
/// side-effecting instructions on any chain path. In practice, this looks
/// through token factors and non-volatile loads. In order to remain efficient,
/// this only looks a couple of nodes in; it does not do an exhaustive search.
///
/// Note that we only need to examine chains when we're searching for
/// side-effects; SelectionDAG requires that all side-effects are represented
/// by chains, even if another operand would force a specific ordering. This
/// constraint is necessary to allow transformations like splitting loads.
bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
unsigned Depth) const {
if (*this == Dest) return true;
// Don't search too deeply, we just want to be able to see through
// TokenFactor's etc.
if (Depth == 0) return false;
// If this is a token factor, all inputs to the TF happen in parallel.
if (getOpcode() == ISD::TokenFactor) {
// First, try a shallow search.
if (is_contained((*this)->ops(), Dest)) {
// We found the chain we want as an operand of this TokenFactor.
// Essentially, we reach the chain without side-effects if we could
// serialize the TokenFactor into a simple chain of operations with
// Dest as the last operation. This is automatically true if the
// chain has one use: there are no other ordering constraints.
// If the chain has more than one use, we give up: some other
// use of Dest might force a side-effect between Dest and the current
// node.
if (Dest.hasOneUse())
return true;
}
// Next, try a deep search: check whether every operand of the TokenFactor
// reaches Dest.
return llvm::all_of((*this)->ops(), [=](SDValue Op) {
return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
});
}
// Loads don't have side effects, look through them.
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
if (!Ld->isVolatile())
return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
}
return false;
}
bool SDNode::hasPredecessor(const SDNode *N) const {
SmallPtrSet<const SDNode *, 32> Visited;
SmallVector<const SDNode *, 16> Worklist;
Worklist.push_back(this);
return hasPredecessorHelper(N, Visited, Worklist);
}
void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
this->Flags.intersectWith(Flags);
}
SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
assert(N->getNumValues() == 1 &&
"Can't unroll a vector with multiple results!");
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SDLoc dl(N);
SmallVector<SDValue, 8> Scalars;
SmallVector<SDValue, 4> Operands(N->getNumOperands());
// If ResNE is 0, fully unroll the vector op.
if (ResNE == 0)
ResNE = NE;
else if (NE > ResNE)
NE = ResNE;
unsigned i;
for (i = 0; i != NE; ++i) {
for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
SDValue Operand = N->getOperand(j);
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
// A vector operand; extract a single element.
EVT OperandEltVT = OperandVT.getVectorElementType();
Operands[j] =
getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout())));
} else {
// A scalar operand; just use it as is.
Operands[j] = Operand;
}
}
switch (N->getOpcode()) {
default: {
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands,
N->getFlags()));
break;
}
case ISD::VSELECT:
Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands));
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::ROTL:
case ISD::ROTR:
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
getShiftAmountOperand(Operands[0].getValueType(),
Operands[1])));
break;
case ISD::SIGN_EXTEND_INREG:
case ISD::FP_ROUND_INREG: {
EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
Operands[0],
getValueType(ExtVT)));
}
}
}
for (; i < ResNE; ++i)
Scalars.push_back(getUNDEF(EltVT));
EVT VecVT = EVT::getVectorVT(*getContext(), EltVT, ResNE);
return getBuildVector(VecVT, dl, Scalars);
}
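// Illustrative sketch (not part of the original source): with ResNE == 0 the
// call below fully unrolls a single-result <N x iM> operation into N scalar
// nodes recombined by getBuildVector, per the logic above. The helper name is
// hypothetical.
#if 0
static SDValue unrollVectorAddExample(SelectionDAG &DAG, SDNode *VecAdd) {
  // VecAdd is assumed to be a single-result vector ISD::ADD node.
  return DAG.UnrollVectorOp(VecAdd, /*ResNE=*/0);
}
#endif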
bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
LoadSDNode *Base,
unsigned Bytes,
int Dist) const {
if (LD->isVolatile() || Base->isVolatile())
return false;
if (LD->isIndexed() || Base->isIndexed())
return false;
if (LD->getChain() != Base->getChain())
return false;
EVT VT = LD->getValueType(0);
if (VT.getSizeInBits() / 8 != Bytes)
return false;
SDValue Loc = LD->getOperand(1);
SDValue BaseLoc = Base->getOperand(1);
auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
auto LocDecomp = BaseIndexOffset::match(Loc, *this);
int64_t Offset = 0;
if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
return (Dist * Bytes == Offset);
return false;
}
/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
// If this is a GlobalAddress + cst, return the alignment.
const GlobalValue *GV;
int64_t GVOffset = 0;
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
KnownBits Known(PtrWidth);
llvm::computeKnownBits(GV, Known, getDataLayout());
unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
return MinAlign(Align, GVOffset);
}
// If this is a direct reference to a stack slot, use information about the
// stack slot's alignment.
int FrameIdx = 1 << 31;
int64_t FrameOffset = 0;
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
FrameIdx = FI->getIndex();
} else if (isBaseWithConstantOffset(Ptr) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
// Handle FI+Cst
FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
FrameOffset = Ptr.getConstantOperandVal(1);
}
if (FrameIdx != (1 << 31)) {
const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
FrameOffset);
return FIInfoAlign;
}
return 0;
}
/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
/// which is split (or expanded) into two not necessarily identical pieces.
std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
// Currently all types are split in half.
EVT LoVT, HiVT;
if (!VT.isVector())
LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
else
LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());
return std::make_pair(LoVT, HiVT);
}
/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
/// low/high part.
std::pair<SDValue, SDValue>
SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
const EVT &HiVT) {
assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
N.getValueType().getVectorNumElements() &&
"More vector elements requested than available!");
SDValue Lo, Hi;
Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
getConstant(LoVT.getVectorNumElements(), DL,
TLI->getVectorIdxTy(getDataLayout())));
return std::make_pair(Lo, Hi);
}
void SelectionDAG::ExtractVectorElements(SDValue Op,
SmallVectorImpl<SDValue> &Args,
unsigned Start, unsigned Count) {
EVT VT = Op.getValueType();
if (Count == 0)
Count = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
EVT IdxTy = TLI->getVectorIdxTy(getDataLayout());
SDLoc SL(Op);
for (unsigned i = Start, e = Start + Count; i != e; ++i) {
Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Op, getConstant(i, SL, IdxTy)));
}
}
// getAddressSpace - Return the address space this GlobalAddress belongs to.
unsigned GlobalAddressSDNode::getAddressSpace() const {
return getGlobal()->getType()->getAddressSpace();
}
Type *ConstantPoolSDNode::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
return Val.ConstVal->getType();
}
bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
unsigned &SplatBitSize,
bool &HasAnyUndefs,
unsigned MinSplatBits,
bool IsBigEndian) const {
EVT VT = getValueType(0);
assert(VT.isVector() && "Expected a vector type");
unsigned VecWidth = VT.getSizeInBits();
if (MinSplatBits > VecWidth)
return false;
// FIXME: The widths are based on this node's type, but build vectors can
// truncate their operands.
SplatValue = APInt(VecWidth, 0);
SplatUndef = APInt(VecWidth, 0);
// Get the bits. Bits with undefined values (when the corresponding element
// of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
// in SplatValue. If any of the values are not constant, give up and return
// false.
unsigned int NumOps = getNumOperands();
assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
unsigned EltWidth = VT.getScalarSizeInBits();
for (unsigned j = 0; j < NumOps; ++j) {
unsigned i = IsBigEndian ? NumOps - 1 - j : j;
SDValue OpVal = getOperand(i);
unsigned BitPos = j * EltWidth;
if (OpVal.isUndef())
SplatUndef.setBits(BitPos, BitPos + EltWidth);
else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
else
return false;
}
// The build_vector is all constants or undefs. Find the smallest element
// size that splats the vector.
HasAnyUndefs = (SplatUndef != 0);
// FIXME: This does not work for vectors with elements less than 8 bits.
while (VecWidth > 8) {
unsigned HalfSize = VecWidth / 2;
APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
APInt LowValue = SplatValue.trunc(HalfSize);
APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
APInt LowUndef = SplatUndef.trunc(HalfSize);
// If the two halves do not match (ignoring undef bits), stop here.
if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
MinSplatBits > HalfSize)
break;
SplatValue = HighValue | LowValue;
SplatUndef = HighUndef & LowUndef;
VecWidth = HalfSize;
}
SplatBitSize = VecWidth;
return true;
}
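// Typical call pattern (a sketch mirroring how targets query build_vector
// splats; BVN is assumed to be a BuildVectorSDNode*):
//
//   APInt SplatBits, SplatUndef;
//   unsigned SplatBitSize;
//   bool HasAnyUndefs;
//   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
//                            /*MinSplatBits=*/32))
//     ; // SplatBits repeats every SplatBitSize bits across the vector.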
SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
if (UndefElements) {
UndefElements->clear();
UndefElements->resize(getNumOperands());
}
SDValue Splatted;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
SDValue Op = getOperand(i);
if (Op.isUndef()) {
if (UndefElements)
(*UndefElements)[i] = true;
} else if (!Splatted) {
Splatted = Op;
} else if (Splatted != Op) {
return SDValue();
}
}
if (!Splatted) {
assert(getOperand(0).isUndef() &&
"Can only have a splat without a constant for all undefs.");
return getOperand(0);
}
return Splatted;
}
ConstantSDNode *
BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
return dyn_cast_or_null<ConstantSDNode>(getSplatValue(UndefElements));
}
ConstantFPSDNode *
BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements));
}
int32_t
BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
uint32_t BitWidth) const {
if (ConstantFPSDNode *CN =
dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
bool IsExact;
APSInt IntVal(BitWidth);
const APFloat &APF = CN->getValueAPF();
if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
APFloat::opOK ||
!IsExact)
return -1;
return IntVal.exactLogBase2();
}
return -1;
}
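// Sketch of the splat accessors above (BVN again assumed to be a
// BuildVectorSDNode*):
//
//   BitVector UndefElts;
//   if (ConstantSDNode *C = BVN->getConstantSplatNode(&UndefElts))
//     ; // Every defined element equals C; UndefElts marks the undef lanes.
//   int32_t Log2 = BVN->getConstantFPSplatPow2ToLog2Int(nullptr, 32);
//   // Log2 is non-negative only if the FP splat converts exactly to an
//   // integer power of two; otherwise it is -1.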
bool BuildVectorSDNode::isConstant() const {
for (const SDValue &Op : op_values()) {
unsigned Opc = Op.getOpcode();
if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP)
return false;
}
return true;
}
bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
// Find the first non-undef value in the shuffle mask.
unsigned i, e;
for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
/* search */;
assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
// Make sure all remaining elements are either undef or the same as the first
// non-undef value.
for (int Idx = Mask[i]; i != e; ++i)
if (Mask[i] >= 0 && Mask[i] != Idx)
return false;
return true;
}
// \brief Returns the SDNode if it is a constant integer BuildVector
// or constant integer.
SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
if (isa<ConstantSDNode>(N))
return N.getNode();
if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
return N.getNode();
// Treat a GlobalAddress supporting constant offset folding as a
// constant integer.
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
if (GA->getOpcode() == ISD::GlobalAddress &&
TLI->isOffsetFoldingLegal(GA))
return GA;
return nullptr;
}
SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
if (isa<ConstantFPSDNode>(N))
return N.getNode();
if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
return N.getNode();
return nullptr;
}
#ifndef NDEBUG
static void checkForCyclesHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode*> &Visited,
SmallPtrSetImpl<const SDNode*> &Checked,
const llvm::SelectionDAG *DAG) {
// If this node has already been checked, don't check it again.
if (Checked.count(N))
return;
// If a node has already been visited on this depth-first walk, reject it as
// a cycle.
if (!Visited.insert(N).second) {
errs() << "Detected cycle in SelectionDAG\n";
dbgs() << "Offending node:\n";
N->dumprFull(DAG); dbgs() << "\n";
abort();
}
for (const SDValue &Op : N->op_values())
checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG);
Checked.insert(N);
Visited.erase(N);
}
#endif
void llvm::checkForCycles(const llvm::SDNode *N,
const llvm::SelectionDAG *DAG,
bool force) {
#ifndef NDEBUG
bool check = force;
#ifdef EXPENSIVE_CHECKS
check = true;
#endif // EXPENSIVE_CHECKS
if (check) {
assert(N && "Checking nonexistent SDNode");
SmallPtrSet<const SDNode*, 32> visited;
SmallPtrSet<const SDNode*, 32> checked;
checkForCyclesHelper(N, visited, checked, DAG);
}
#endif // !NDEBUG
}
void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
checkForCycles(DAG->getRoot().getNode(), DAG, force);
}
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 124c2790f68c..f8aacdb8649d 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -1,515 +1,543 @@
//===-- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the VirtRegMap class.
//
// It also contains implementations of the Spiller interface, which, given a
// virtual register map and a machine function, eliminates all virtual
// references by replacing them with physical register references - adding spill
// code as necessary.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/VirtRegMap.h"
#include "LiveDebugVariables.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
STATISTIC(NumSpillSlots, "Number of spill slots allocated");
STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
//===----------------------------------------------------------------------===//
// VirtRegMap implementation
//===----------------------------------------------------------------------===//
char VirtRegMap::ID = 0;
INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false)
bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
MRI = &mf.getRegInfo();
TII = mf.getSubtarget().getInstrInfo();
TRI = mf.getSubtarget().getRegisterInfo();
MF = &mf;
Virt2PhysMap.clear();
Virt2StackSlotMap.clear();
Virt2SplitMap.clear();
grow();
return false;
}
void VirtRegMap::grow() {
unsigned NumRegs = MF->getRegInfo().getNumVirtRegs();
Virt2PhysMap.resize(NumRegs);
Virt2StackSlotMap.resize(NumRegs);
Virt2SplitMap.resize(NumRegs);
}
void VirtRegMap::assignVirt2Phys(unsigned virtReg, MCPhysReg physReg) {
assert(TargetRegisterInfo::isVirtualRegister(virtReg) &&
TargetRegisterInfo::isPhysicalRegister(physReg));
assert(Virt2PhysMap[virtReg] == NO_PHYS_REG &&
"attempt to assign physical register to already mapped "
"virtual register");
assert(!getRegInfo().isReserved(physReg) &&
"Attempt to map virtReg to a reserved physReg");
Virt2PhysMap[virtReg] = physReg;
}
unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align);
++NumSpillSlots;
return SS;
}
bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) {
unsigned Hint = MRI->getSimpleHint(VirtReg);
if (!Hint)
return false;
if (TargetRegisterInfo::isVirtualRegister(Hint))
Hint = getPhys(Hint);
return getPhys(VirtReg) == Hint;
}
bool VirtRegMap::hasKnownPreference(unsigned VirtReg) {
std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(VirtReg);
if (TargetRegisterInfo::isPhysicalRegister(Hint.second))
return true;
if (TargetRegisterInfo::isVirtualRegister(Hint.second))
return hasPhys(Hint.second);
return false;
}
int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
assert(TargetRegisterInfo::isVirtualRegister(virtReg));
assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
"attempt to assign stack slot to already spilled register");
const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
return Virt2StackSlotMap[virtReg] = createSpillSlot(RC);
}
void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
assert(TargetRegisterInfo::isVirtualRegister(virtReg));
assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
"attempt to assign stack slot to already spilled register");
assert((SS >= 0 ||
(SS >= MF->getFrameInfo().getObjectIndexBegin())) &&
"illegal fixed frame index");
Virt2StackSlotMap[virtReg] = SS;
}
void VirtRegMap::print(raw_ostream &OS, const Module*) const {
OS << "********** REGISTER MAP **********\n";
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
OS << '[' << PrintReg(Reg, TRI) << " -> "
<< PrintReg(Virt2PhysMap[Reg], TRI) << "] "
<< TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
}
}
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
<< "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
}
}
OS << '\n';
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void VirtRegMap::dump() const {
print(dbgs());
}
#endif
//===----------------------------------------------------------------------===//
// VirtRegRewriter
//===----------------------------------------------------------------------===//
//
// The VirtRegRewriter is the last of the register allocator passes.
// It rewrites virtual registers to physical registers as specified in the
// VirtRegMap analysis. It also updates live-in information on basic blocks
// according to LiveIntervals.
//
namespace {
class VirtRegRewriter : public MachineFunctionPass {
MachineFunction *MF;
const TargetMachine *TM;
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
SlotIndexes *Indexes;
LiveIntervals *LIS;
VirtRegMap *VRM;
void rewrite();
void addMBBLiveIns();
bool readsUndefSubreg(const MachineOperand &MO) const;
void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
void handleIdentityCopy(MachineInstr &MI) const;
void expandCopyBundle(MachineInstr &MI) const;
+ bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const;
public:
static char ID;
VirtRegRewriter() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnMachineFunction(MachineFunction&) override;
MachineFunctionProperties getSetProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
};
} // end anonymous namespace
char &llvm::VirtRegRewriterID = VirtRegRewriter::ID;
INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
"Virtual Register Rewriter", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
"Virtual Register Rewriter", false, false)
char VirtRegRewriter::ID = 0;
void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<LiveIntervals>();
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
AU.addRequired<VirtRegMap>();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
MF = &fn;
TM = &MF->getTarget();
TRI = MF->getSubtarget().getRegisterInfo();
TII = MF->getSubtarget().getInstrInfo();
MRI = &MF->getRegInfo();
Indexes = &getAnalysis<SlotIndexes>();
LIS = &getAnalysis<LiveIntervals>();
VRM = &getAnalysis<VirtRegMap>();
DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
<< "********** Function: "
<< MF->getName() << '\n');
DEBUG(VRM->dump());
// Add kill flags while we still have virtual registers.
LIS->addKillFlags(VRM);
// Live-in lists on basic blocks are required for physregs.
addMBBLiveIns();
// Rewrite virtual registers.
rewrite();
// Write out new DBG_VALUE instructions.
getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
// All machine operands and other references to virtual registers have been
// replaced. Remove the virtual registers and release all the transient data.
VRM->clearAllVirt();
MRI->clearVirtRegs();
return true;
}
void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
unsigned PhysReg) const {
assert(!LI.empty());
assert(LI.hasSubRanges());
typedef std::pair<const LiveInterval::SubRange *,
LiveInterval::const_iterator> SubRangeIteratorPair;
SmallVector<SubRangeIteratorPair, 4> SubRanges;
SlotIndex First;
SlotIndex Last;
for (const LiveInterval::SubRange &SR : LI.subranges()) {
SubRanges.push_back(std::make_pair(&SR, SR.begin()));
if (!First.isValid() || SR.segments.front().start < First)
First = SR.segments.front().start;
if (!Last.isValid() || SR.segments.back().end > Last)
Last = SR.segments.back().end;
}
// Check all mbb start positions between First and Last while
// simultaneously advancing an iterator for each subrange.
for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First);
MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) {
SlotIndex MBBBegin = MBBI->first;
// Advance all subrange iterators so that their end position is just
// behind MBBBegin (or the iterator is at the end).
LaneBitmask LaneMask;
for (auto &RangeIterPair : SubRanges) {
const LiveInterval::SubRange *SR = RangeIterPair.first;
LiveInterval::const_iterator &SRI = RangeIterPair.second;
while (SRI != SR->end() && SRI->end <= MBBBegin)
++SRI;
if (SRI == SR->end())
continue;
if (SRI->start <= MBBBegin)
LaneMask |= SR->LaneMask;
}
if (LaneMask.none())
continue;
MachineBasicBlock *MBB = MBBI->second;
MBB->addLiveIn(PhysReg, LaneMask);
}
}
// Compute MBB live-in lists from virtual register live ranges and their
// assignments.
void VirtRegRewriter::addMBBLiveIns() {
for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx);
if (MRI->reg_nodbg_empty(VirtReg))
continue;
LiveInterval &LI = LIS->getInterval(VirtReg);
if (LI.empty() || LIS->intervalIsInOneMBB(LI))
continue;
// This is a virtual register that is live across basic blocks. Its
// assigned PhysReg must be marked as live-in to those blocks.
unsigned PhysReg = VRM->getPhys(VirtReg);
assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
if (LI.hasSubRanges()) {
addLiveInsForSubRanges(LI, PhysReg);
} else {
// Go over MBB begin positions and see if we have segments covering them.
// The following works because segments and the MBBIndex list are both
// sorted by slot indexes.
SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin();
for (const auto &Seg : LI) {
I = Indexes->advanceMBBIndex(I, Seg.start);
for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) {
MachineBasicBlock *MBB = I->second;
MBB->addLiveIn(PhysReg);
}
}
}
}
// Sort and unique MBB LiveIns as we've not checked if SubReg/PhysReg were in
// each MBB's LiveIns set before calling addLiveIn on them.
for (MachineBasicBlock &MBB : *MF)
MBB.sortUniqueLiveIns();
}
/// Returns true if the given machine operand \p MO only reads undefined lanes.
/// The function only works for use operands with a subregister set.
bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
// Shortcut if the operand is already marked undef.
if (MO.isUndef())
return true;
unsigned Reg = MO.getReg();
const LiveInterval &LI = LIS->getInterval(Reg);
const MachineInstr &MI = *MO.getParent();
SlotIndex BaseIndex = LIS->getInstructionIndex(MI);
// This code is only meant to handle reading undefined subregisters which
// we couldn't properly detect before.
assert(LI.liveAt(BaseIndex) &&
"Reads of completely dead register should be marked undef already");
unsigned SubRegIdx = MO.getSubReg();
assert(SubRegIdx != 0 && LI.hasSubRanges());
LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
// See if any of the relevant subregister live ranges is defined at this point.
for (const LiveInterval::SubRange &SR : LI.subranges()) {
if ((SR.LaneMask & UseMask).any() && SR.liveAt(BaseIndex))
return false;
}
return true;
}
void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
if (!MI.isIdentityCopy())
return;
DEBUG(dbgs() << "Identity copy: " << MI);
++NumIdCopies;
// Copies like:
// %R0 = COPY %R0<undef>
// %AL = COPY %AL, %EAX<imp-def>
// give us additional liveness information: The target (super-)register
// must not be valid before this point. Replace the COPY with a KILL
// instruction to maintain this information.
if (MI.getOperand(0).isUndef() || MI.getNumOperands() > 2) {
MI.setDesc(TII->get(TargetOpcode::KILL));
DEBUG(dbgs() << " replace by: " << MI);
return;
}
if (Indexes)
Indexes->removeSingleMachineInstrFromMaps(MI);
MI.eraseFromBundle();
DEBUG(dbgs() << " deleted.\n");
}
/// The live range splitting logic sometimes produces bundles of copies when
/// subregisters are involved. Expand these into a sequence of copy instructions
/// after processing the last one in the bundle. This does not update
/// LiveIntervals, which we shouldn't need for this instruction anymore.
void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
if (!MI.isCopy())
return;
if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
// Only do this when the complete bundle is made out of COPYs.
MachineBasicBlock &MBB = *MI.getParent();
for (MachineBasicBlock::reverse_instr_iterator I =
std::next(MI.getReverseIterator()), E = MBB.instr_rend();
I != E && I->isBundledWithSucc(); ++I) {
if (!I->isCopy())
return;
}
for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator();
I->isBundledWithPred(); ) {
MachineInstr &MI = *I;
++I;
MI.unbundleFromPred();
if (Indexes)
Indexes->insertMachineInstrInMaps(MI);
}
}
}
+/// Check whether (part of) \p SuperPhysReg is live through \p MI.
+/// \pre \p MI defines a subregister of a virtual register that
+/// has been assigned to \p SuperPhysReg.
+bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
+ unsigned SuperPhysReg) const {
+ SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+ SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
+ SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
+ for (MCRegUnitIterator Unit(SuperPhysReg, TRI); Unit.isValid(); ++Unit) {
+ const LiveRange &UnitRange = LIS->getRegUnit(*Unit);
+ // If the regunit is live both before and after MI,
+ // we assume it is live through.
+ // Generally speaking, this is not true, because something like
+ // "RU = op RU" would match that description.
+ // However, we know that we are trying to assess whether
+ // a def of a virtual reg, vreg, is live at the same time as RU.
+ // If we are in the "RU = op RU" situation, that means that vreg
+ // is defined at the same time as RU (i.e., "vreg, RU = op RU").
+ // Thus, vreg and RU interfere, and vreg cannot be assigned to
+ // SuperPhysReg. Therefore, this situation cannot happen.
+ if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
+ return true;
+ }
+ return false;
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
DEBUG(MBBI->print(dbgs(), Indexes));
for (MachineBasicBlock::instr_iterator
MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
MachineInstr *MI = &*MII;
++MII;
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
// Make sure MRI knows about registers clobbered by regmasks.
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
continue;
unsigned VirtReg = MO.getReg();
unsigned PhysReg = VRM->getPhys(VirtReg);
assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
"Instruction uses unmapped VirtReg");
assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
// Preserve semantics of sub-register operands.
unsigned SubReg = MO.getSubReg();
if (SubReg != 0) {
if (NoSubRegLiveness) {
// A virtual register kill refers to the whole register, so we may
// have to add <imp-use,kill> operands for the super-register. A
// partial redef always kills and redefines the super-register.
- if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+ if ((MO.readsReg() && (MO.isDef() || MO.isKill())) ||
+ (MO.isDef() && subRegLiveThrough(*MI, PhysReg)))
SuperKills.push_back(PhysReg);
if (MO.isDef()) {
// Also add implicit defs for the super-register.
if (MO.isDead())
SuperDeads.push_back(PhysReg);
else
SuperDefs.push_back(PhysReg);
}
} else {
if (MO.isUse()) {
if (readsUndefSubreg(MO))
// We need to add an <undef> flag if the subregister is
// completely undefined (and we are not adding super-register
// defs).
MO.setIsUndef(true);
} else if (!MO.isDead()) {
assert(MO.isDef());
}
}
// The <def,undef> and <def,internal> flags only make sense for
// sub-register defs, and we are substituting a full physreg. An
// <imp-use,kill> operand from the SuperKills list will represent the
// partial read of the super-register.
if (MO.isDef()) {
MO.setIsUndef(false);
MO.setIsInternalRead(false);
}
// PhysReg operands cannot have subregister indexes.
PhysReg = TRI->getSubReg(PhysReg, SubReg);
assert(PhysReg && "Invalid SubReg for physical register");
MO.setSubReg(0);
}
// Rewrite. Note we could have used MachineOperand::substPhysReg(), but
// we need the inlining here.
MO.setReg(PhysReg);
}
// Add any missing super-register kills after rewriting the whole
// instruction.
while (!SuperKills.empty())
MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
while (!SuperDeads.empty())
MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
while (!SuperDefs.empty())
MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
DEBUG(dbgs() << "> " << *MI);
expandCopyBundle(*MI);
// We can remove identity copies right now.
handleIdentityCopy(*MI);
}
}
}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 495e09fbae35..dd3235244e24 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1,1113 +1,1113 @@
//===- DWARFContext.cpp ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
#include "llvm/DebugInfo/DWARF/DWARFSection.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/Object/Decompressor.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/RelocVisitor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
using namespace dwarf;
using namespace object;
#define DEBUG_TYPE "dwarf"
using DWARFLineTable = DWARFDebugLine::LineTable;
using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind;
static void dumpAccelSection(raw_ostream &OS, StringRef Name,
const DWARFSection& Section, StringRef StringSection,
bool LittleEndian) {
DWARFDataExtractor AccelSection(Section, LittleEndian, 0);
DataExtractor StrData(StringSection, LittleEndian, 0);
OS << "\n." << Name << " contents:\n";
DWARFAcceleratorTable Accel(AccelSection, StrData);
if (!Accel.extract())
return;
Accel.dump(OS);
}
static void
dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
const DWARFSection &StringOffsetsSection,
StringRef StringSection, bool LittleEndian) {
DWARFDataExtractor StrOffsetExt(StringOffsetsSection, LittleEndian, 0);
uint32_t Offset = 0;
uint64_t SectionSize = StringOffsetsSection.Data.size();
while (Offset < SectionSize) {
unsigned Version = 0;
DwarfFormat Format = DWARF32;
unsigned EntrySize = 4;
// Perform validation and extract the segment size from the header.
if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 4)) {
OS << "error: invalid contribution to string offsets table in section ."
<< SectionName << ".\n";
return;
}
uint32_t ContributionStart = Offset;
uint64_t ContributionSize = StrOffsetExt.getU32(&Offset);
// A contribution size of 0xffffffff indicates DWARF64, with the actual size
// in the following 8 bytes. Otherwise, the DWARF standard mandates that
// the contribution size must be at most 0xfffffff0.
if (ContributionSize == 0xffffffff) {
if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 8)) {
OS << "error: invalid contribution to string offsets table in section ."
<< SectionName << ".\n";
return;
}
Format = DWARF64;
EntrySize = 8;
ContributionSize = StrOffsetExt.getU64(&Offset);
} else if (ContributionSize > 0xfffffff0) {
OS << "error: invalid contribution to string offsets table in section ."
<< SectionName << ".\n";
return;
}
// We must ensure that we don't read a partial record at the end, so we
// validate for a multiple of EntrySize. Also, we're expecting a version
// number and padding, which adds an additional 4 bytes.
uint64_t ValidationSize =
4 + ((ContributionSize + EntrySize - 1) & (-(uint64_t)EntrySize));
if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, ValidationSize)) {
OS << "error: contribution to string offsets table in section ."
<< SectionName << " has invalid length.\n";
return;
}
Version = StrOffsetExt.getU16(&Offset);
Offset += 2;
OS << format("0x%8.8x: ", ContributionStart);
OS << "Contribution size = " << ContributionSize
<< ", Version = " << Version << "\n";
uint32_t ContributionBase = Offset;
DataExtractor StrData(StringSection, LittleEndian, 0);
while (Offset - ContributionBase < ContributionSize) {
OS << format("0x%8.8x: ", Offset);
// FIXME: We can only extract strings in DWARF32 format at the moment.
uint64_t StringOffset =
StrOffsetExt.getRelocatedValue(EntrySize, &Offset);
if (Format == DWARF32) {
- OS << format("%8.8x ", StringOffset);
uint32_t StringOffset32 = (uint32_t)StringOffset;
+ OS << format("%8.8x ", StringOffset32);
const char *S = StrData.getCStr(&StringOffset32);
if (S)
OS << format("\"%s\"", S);
} else
- OS << format("%16.16x ", StringOffset);
+ OS << format("%16.16" PRIx64 " ", StringOffset);
OS << "\n";
}
}
}
// Dump a DWARF string offsets section. This may be a DWARF v5 formatted
// string offsets section, where each compile or type unit contributes a
// number of entries (string offsets), with each contribution preceded by
// a header containing size and version number. Alternatively, it may be a
// monolithic series of string offsets, as generated by the pre-DWARF v5
// implementation of split DWARF.
static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName,
const DWARFSection &StringOffsetsSection,
StringRef StringSection, bool LittleEndian,
unsigned MaxVersion) {
if (StringOffsetsSection.Data.empty())
return;
OS << "\n." << SectionName << " contents:\n";
// If we have at least one (compile or type) unit with DWARF v5 or greater,
// we assume that the section is formatted like a DWARF v5 string offsets
// section.
if (MaxVersion >= 5)
dumpDWARFv5StringOffsetsSection(OS, SectionName, StringOffsetsSection,
StringSection, LittleEndian);
else {
DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0);
uint32_t offset = 0;
uint64_t size = StringOffsetsSection.Data.size();
// Ensure that size is a multiple of the size of an entry.
if (size & ((uint64_t)(sizeof(uint32_t) - 1))) {
OS << "error: size of ." << SectionName << " is not a multiple of "
<< sizeof(uint32_t) << ".\n";
size &= -(uint64_t)sizeof(uint32_t);
}
DataExtractor StrData(StringSection, LittleEndian, 0);
while (offset < size) {
OS << format("0x%8.8x: ", offset);
uint32_t StringOffset = strOffsetExt.getU32(&offset);
OS << format("%8.8x ", StringOffset);
const char *S = StrData.getCStr(&StringOffset);
if (S)
OS << format("\"%s\"", S);
OS << "\n";
}
}
}
void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
DIDumpType DumpType = DumpOpts.DumpType;
bool DumpEH = DumpOpts.DumpEH;
bool SummarizeTypes = DumpOpts.SummarizeTypes;
if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
OS << ".debug_abbrev contents:\n";
getDebugAbbrev()->dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo)
if (const DWARFDebugAbbrev *D = getDebugAbbrevDWO()) {
OS << "\n.debug_abbrev.dwo contents:\n";
D->dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_Info) {
OS << "\n.debug_info contents:\n";
for (const auto &CU : compile_units())
CU->dump(OS, DumpOpts);
}
if ((DumpType == DIDT_All || DumpType == DIDT_InfoDwo) &&
getNumDWOCompileUnits()) {
OS << "\n.debug_info.dwo contents:\n";
for (const auto &DWOCU : dwo_compile_units())
DWOCU->dump(OS, DumpOpts);
}
if ((DumpType == DIDT_All || DumpType == DIDT_Types) && getNumTypeUnits()) {
OS << "\n.debug_types contents:\n";
for (const auto &TUS : type_unit_sections())
for (const auto &TU : TUS)
TU->dump(OS, SummarizeTypes);
}
if ((DumpType == DIDT_All || DumpType == DIDT_TypesDwo) &&
getNumDWOTypeUnits()) {
OS << "\n.debug_types.dwo contents:\n";
for (const auto &DWOTUS : dwo_type_unit_sections())
for (const auto &DWOTU : DWOTUS)
DWOTU->dump(OS, SummarizeTypes);
}
if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
OS << "\n.debug_loc contents:\n";
getDebugLoc()->dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_LocDwo) {
OS << "\n.debug_loc.dwo contents:\n";
getDebugLocDWO()->dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
OS << "\n.debug_frame contents:\n";
getDebugFrame()->dump(OS);
if (DumpEH) {
OS << "\n.eh_frame contents:\n";
getEHFrame()->dump(OS);
}
}
if (DumpType == DIDT_All || DumpType == DIDT_Macro) {
OS << "\n.debug_macinfo contents:\n";
getDebugMacro()->dump(OS);
}
uint32_t offset = 0;
if (DumpType == DIDT_All || DumpType == DIDT_Aranges) {
OS << "\n.debug_aranges contents:\n";
DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
DWARFDebugArangeSet set;
while (set.extract(arangesData, &offset))
set.dump(OS);
}
uint8_t savedAddressByteSize = 0;
if (DumpType == DIDT_All || DumpType == DIDT_Line) {
OS << "\n.debug_line contents:\n";
for (const auto &CU : compile_units()) {
savedAddressByteSize = CU->getAddressByteSize();
auto CUDIE = CU->getUnitDIE();
if (!CUDIE)
continue;
if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list))) {
DWARFDataExtractor lineData(getLineSection(), isLittleEndian(),
savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
uint32_t Offset = *StmtOffset;
LineTable.parse(lineData, &Offset);
LineTable.dump(OS);
}
}
}
if (DumpType == DIDT_All || DumpType == DIDT_CUIndex) {
OS << "\n.debug_cu_index contents:\n";
getCUIndex().dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_TUIndex) {
OS << "\n.debug_tu_index contents:\n";
getTUIndex().dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) {
OS << "\n.debug_line.dwo contents:\n";
unsigned stmtOffset = 0;
DWARFDataExtractor lineData(getLineDWOSection(), isLittleEndian(),
savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
LineTable.dump(OS);
LineTable.clear();
}
}
if (DumpType == DIDT_All || DumpType == DIDT_Str) {
OS << "\n.debug_str contents:\n";
DataExtractor strData(getStringSection(), isLittleEndian(), 0);
offset = 0;
uint32_t strOffset = 0;
while (const char *s = strData.getCStr(&offset)) {
OS << format("0x%8.8x: \"%s\"\n", strOffset, s);
strOffset = offset;
}
}
if ((DumpType == DIDT_All || DumpType == DIDT_StrDwo) &&
!getStringDWOSection().empty()) {
OS << "\n.debug_str.dwo contents:\n";
DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0);
offset = 0;
uint32_t strDWOOffset = 0;
while (const char *s = strDWOData.getCStr(&offset)) {
OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s);
strDWOOffset = offset;
}
}
if (DumpType == DIDT_All || DumpType == DIDT_Ranges) {
OS << "\n.debug_ranges contents:\n";
// In fact, different compile units may have different address byte
// sizes, but for simplicity we just use the address byte size of the last
// compile unit (there is no easy and fast way to associate an address range
// list with the compile unit it describes).
DWARFDataExtractor rangesData(getRangeSection(), isLittleEndian(),
savedAddressByteSize);
offset = 0;
DWARFDebugRangeList rangeList;
while (rangeList.extract(rangesData, &offset))
rangeList.dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
DWARFDebugPubTable(getPubNamesSection(), isLittleEndian(), false)
.dump("debug_pubnames", OS);
if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
DWARFDebugPubTable(getPubTypesSection(), isLittleEndian(), false)
.dump("debug_pubtypes", OS);
if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
DWARFDebugPubTable(getGnuPubNamesSection(), isLittleEndian(),
true /* GnuStyle */)
.dump("debug_gnu_pubnames", OS);
if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
DWARFDebugPubTable(getGnuPubTypesSection(), isLittleEndian(),
true /* GnuStyle */)
.dump("debug_gnu_pubtypes", OS);
if (DumpType == DIDT_All || DumpType == DIDT_StrOffsets)
dumpStringOffsetsSection(OS, "debug_str_offsets", getStringOffsetSection(),
getStringSection(), isLittleEndian(),
getMaxVersion());
if (DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) {
dumpStringOffsetsSection(OS, "debug_str_offsets.dwo",
getStringOffsetDWOSection(), getStringDWOSection(),
isLittleEndian(), getMaxVersion());
}
if ((DumpType == DIDT_All || DumpType == DIDT_GdbIndex) &&
!getGdbIndexSection().empty()) {
OS << "\n.gnu_index contents:\n";
getGdbIndex().dump(OS);
}
if (DumpType == DIDT_All || DumpType == DIDT_AppleNames)
dumpAccelSection(OS, "apple_names", getAppleNamesSection(),
getStringSection(), isLittleEndian());
if (DumpType == DIDT_All || DumpType == DIDT_AppleTypes)
dumpAccelSection(OS, "apple_types", getAppleTypesSection(),
getStringSection(), isLittleEndian());
if (DumpType == DIDT_All || DumpType == DIDT_AppleNamespaces)
dumpAccelSection(OS, "apple_namespaces", getAppleNamespacesSection(),
getStringSection(), isLittleEndian());
if (DumpType == DIDT_All || DumpType == DIDT_AppleObjC)
dumpAccelSection(OS, "apple_objc", getAppleObjCSection(),
getStringSection(), isLittleEndian());
}
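// Example invocation (a sketch; DICtx is assumed to be a DWARFContext
// reference obtained elsewhere, e.g. from a DWARFContextInMemory):
//
//   DIDumpOptions DumpOpts;
//   DumpOpts.DumpType = DIDT_Info;   // or DIDT_All, DIDT_Line, ...
//   DICtx.dump(llvm::outs(), DumpOpts);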
DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
// FIXME: Improve this for the case where this DWO file is really a DWP file
// with an index - use the index for lookup instead of a linear search.
for (const auto &DWOCU : dwo_compile_units())
if (DWOCU->getDWOId() == Hash)
return DWOCU.get();
return nullptr;
}
DWARFDie DWARFContext::getDIEForOffset(uint32_t Offset) {
parseCompileUnits();
if (auto *CU = CUs.getUnitForOffset(Offset))
return CU->getDIEForOffset(Offset);
return DWARFDie();
}
bool DWARFContext::verify(raw_ostream &OS, DIDumpType DumpType) {
bool Success = true;
DWARFVerifier verifier(OS, *this);
if (DumpType == DIDT_All || DumpType == DIDT_Info) {
if (!verifier.handleDebugInfo())
Success = false;
}
if (DumpType == DIDT_All || DumpType == DIDT_Line) {
if (!verifier.handleDebugLine())
Success = false;
}
if (DumpType == DIDT_All || DumpType == DIDT_AppleNames) {
if (!verifier.handleAppleNames())
Success = false;
}
return Success;
}
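// Corresponding verification sketch (same DICtx assumption as above):
//
//   if (!DICtx.verify(llvm::errs(), DIDT_All))
//     ; // at least one of the .debug_info, .debug_line or apple_names checks failed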
const DWARFUnitIndex &DWARFContext::getCUIndex() {
if (CUIndex)
return *CUIndex;
DataExtractor CUIndexData(getCUIndexSection(), isLittleEndian(), 0);
CUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_INFO);
CUIndex->parse(CUIndexData);
return *CUIndex;
}
const DWARFUnitIndex &DWARFContext::getTUIndex() {
if (TUIndex)
return *TUIndex;
DataExtractor TUIndexData(getTUIndexSection(), isLittleEndian(), 0);
TUIndex = llvm::make_unique<DWARFUnitIndex>(DW_SECT_TYPES);
TUIndex->parse(TUIndexData);
return *TUIndex;
}
DWARFGdbIndex &DWARFContext::getGdbIndex() {
if (GdbIndex)
return *GdbIndex;
DataExtractor GdbIndexData(getGdbIndexSection(), true /*LE*/, 0);
GdbIndex = llvm::make_unique<DWARFGdbIndex>();
GdbIndex->parse(GdbIndexData);
return *GdbIndex;
}
const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
if (Abbrev)
return Abbrev.get();
DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
Abbrev.reset(new DWARFDebugAbbrev());
Abbrev->extract(abbrData);
return Abbrev.get();
}
const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
if (AbbrevDWO)
return AbbrevDWO.get();
DataExtractor abbrData(getAbbrevDWOSection(), isLittleEndian(), 0);
AbbrevDWO.reset(new DWARFDebugAbbrev());
AbbrevDWO->extract(abbrData);
return AbbrevDWO.get();
}
const DWARFDebugLoc *DWARFContext::getDebugLoc() {
if (Loc)
return Loc.get();
Loc.reset(new DWARFDebugLoc);
// assume all compile units have the same address byte size
if (getNumCompileUnits()) {
DWARFDataExtractor LocData(getLocSection(), isLittleEndian(),
getCompileUnitAtIndex(0)->getAddressByteSize());
Loc->parse(LocData);
}
return Loc.get();
}
const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
if (LocDWO)
return LocDWO.get();
DataExtractor LocData(getLocDWOSection().Data, isLittleEndian(), 0);
LocDWO.reset(new DWARFDebugLocDWO());
LocDWO->parse(LocData);
return LocDWO.get();
}
const DWARFDebugAranges *DWARFContext::getDebugAranges() {
if (Aranges)
return Aranges.get();
Aranges.reset(new DWARFDebugAranges());
Aranges->generate(this);
return Aranges.get();
}
const DWARFDebugFrame *DWARFContext::getDebugFrame() {
if (DebugFrame)
return DebugFrame.get();
// There's a "bug" in the DWARFv3 standard with respect to the target address
// size within debug frame sections. While DWARF is supposed to be independent
// of its container, FDEs have fields with size being "target address size",
// which isn't specified in DWARF in general. It's only specified for CUs, but
// .eh_frame can appear without a .debug_info section. Follow the example of
// other tools (libdwarf) and extract this from the container (ObjectFile
// provides this information). This problem is fixed in DWARFv4.
// See this dwarf-discuss discussion for more details:
// http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(),
getAddressSize());
DebugFrame.reset(new DWARFDebugFrame(false /* IsEH */));
DebugFrame->parse(debugFrameData);
return DebugFrame.get();
}
const DWARFDebugFrame *DWARFContext::getEHFrame() {
if (EHFrame)
return EHFrame.get();
DataExtractor debugFrameData(getEHFrameSection(), isLittleEndian(),
getAddressSize());
DebugFrame.reset(new DWARFDebugFrame(true /* IsEH */));
DebugFrame->parse(debugFrameData);
return DebugFrame.get();
}
const DWARFDebugMacro *DWARFContext::getDebugMacro() {
if (Macro)
return Macro.get();
DataExtractor MacinfoData(getMacinfoSection(), isLittleEndian(), 0);
Macro.reset(new DWARFDebugMacro());
Macro->parse(MacinfoData);
return Macro.get();
}
const DWARFLineTable *
DWARFContext::getLineTableForUnit(DWARFUnit *U) {
if (!Line)
Line.reset(new DWARFDebugLine);
auto UnitDIE = U->getUnitDIE();
if (!UnitDIE)
return nullptr;
auto Offset = toSectionOffset(UnitDIE.find(DW_AT_stmt_list));
if (!Offset)
return nullptr; // No line table for this compile unit.
uint32_t stmtOffset = *Offset + U->getLineTableOffset();
// See if the line table is cached.
if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
return lt;
// Make sure the offset is good before we try to parse.
if (stmtOffset >= U->getLineSection().Data.size())
return nullptr;
// We have to parse it first.
DWARFDataExtractor lineData(U->getLineSection(), isLittleEndian(),
U->getAddressByteSize());
return Line->getOrParseLineTable(lineData, stmtOffset);
}
void DWARFContext::parseCompileUnits() {
CUs.parse(*this, getInfoSection());
}
void DWARFContext::parseTypeUnits() {
if (!TUs.empty())
return;
forEachTypesSections([&](const DWARFSection &S) {
TUs.emplace_back();
TUs.back().parse(*this, S);
});
}
void DWARFContext::parseDWOCompileUnits() {
DWOCUs.parseDWO(*this, getInfoDWOSection());
}
void DWARFContext::parseDWOTypeUnits() {
if (!DWOTUs.empty())
return;
forEachTypesDWOSections([&](const DWARFSection &S) {
DWOTUs.emplace_back();
DWOTUs.back().parseDWO(*this, S);
});
}
DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
parseCompileUnits();
return CUs.getUnitForOffset(Offset);
}
DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
// First, get the offset of the compile unit.
uint32_t CUOffset = getDebugAranges()->findAddress(Address);
// Retrieve the compile unit.
return getCompileUnitForOffset(CUOffset);
}
static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU,
uint64_t Address,
FunctionNameKind Kind,
std::string &FunctionName,
uint32_t &StartLine) {
// The address may correspond to instruction in some inlined function,
// so we have to build the chain of inlined functions and take the
// name of the topmost function in it.
SmallVector<DWARFDie, 4> InlinedChain;
CU->getInlinedChainForAddress(Address, InlinedChain);
if (InlinedChain.empty())
return false;
const DWARFDie &DIE = InlinedChain[0];
bool FoundResult = false;
const char *Name = nullptr;
if (Kind != FunctionNameKind::None && (Name = DIE.getSubroutineName(Kind))) {
FunctionName = Name;
FoundResult = true;
}
if (auto DeclLineResult = DIE.getDeclLine()) {
StartLine = DeclLineResult;
FoundResult = true;
}
return FoundResult;
}
DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
DILineInfoSpecifier Spec) {
DILineInfo Result;
DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
if (!CU)
return Result;
getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind,
Result.FunctionName,
Result.StartLine);
if (Spec.FLIKind != FileLineInfoKind::None) {
if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
Spec.FLIKind, Result);
}
return Result;
}
DILineInfoTable
DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
DILineInfoSpecifier Spec) {
DILineInfoTable Lines;
DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
if (!CU)
return Lines;
std::string FunctionName = "<invalid>";
uint32_t StartLine = 0;
getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind, FunctionName,
StartLine);
// If the Specifier says we don't need FileLineInfo, just
// return the top-most function at the starting address.
if (Spec.FLIKind == FileLineInfoKind::None) {
DILineInfo Result;
Result.FunctionName = FunctionName;
Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Address, Result));
return Lines;
}
const DWARFLineTable *LineTable = getLineTableForUnit(CU);
// Get the index of row we're looking for in the line table.
std::vector<uint32_t> RowVector;
if (!LineTable->lookupAddressRange(Address, Size, RowVector))
return Lines;
for (uint32_t RowIndex : RowVector) {
// Take file number and line/column from the row.
const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
DILineInfo Result;
LineTable->getFileNameByIndex(Row.File, CU->getCompilationDir(),
Spec.FLIKind, Result.FileName);
Result.FunctionName = FunctionName;
Result.Line = Row.Line;
Result.Column = Row.Column;
Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Row.Address, Result));
}
return Lines;
}
DIInliningInfo
DWARFContext::getInliningInfoForAddress(uint64_t Address,
DILineInfoSpecifier Spec) {
DIInliningInfo InliningInfo;
DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
if (!CU)
return InliningInfo;
const DWARFLineTable *LineTable = nullptr;
SmallVector<DWARFDie, 4> InlinedChain;
CU->getInlinedChainForAddress(Address, InlinedChain);
if (InlinedChain.size() == 0) {
// If there is no DIE for the address (e.g. it is in an unavailable .dwo file),
// try to at least get file/line info from symbol table.
if (Spec.FLIKind != FileLineInfoKind::None) {
DILineInfo Frame;
LineTable = getLineTableForUnit(CU);
if (LineTable &&
LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
Spec.FLIKind, Frame))
InliningInfo.addFrame(Frame);
}
return InliningInfo;
}
uint32_t CallFile = 0, CallLine = 0, CallColumn = 0, CallDiscriminator = 0;
for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
DWARFDie &FunctionDIE = InlinedChain[i];
DILineInfo Frame;
// Get function name if necessary.
if (const char *Name = FunctionDIE.getSubroutineName(Spec.FNKind))
Frame.FunctionName = Name;
if (auto DeclLineResult = FunctionDIE.getDeclLine())
Frame.StartLine = DeclLineResult;
if (Spec.FLIKind != FileLineInfoKind::None) {
if (i == 0) {
// For the topmost frame, initialize the line table of this
// compile unit and fetch file/line info from it.
LineTable = getLineTableForUnit(CU);
// For the topmost routine, get file/line info from line table.
if (LineTable)
LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
Spec.FLIKind, Frame);
} else {
// Otherwise, use call file, call line and call column from
// previous DIE in inlined chain.
if (LineTable)
LineTable->getFileNameByIndex(CallFile, CU->getCompilationDir(),
Spec.FLIKind, Frame.FileName);
Frame.Line = CallLine;
Frame.Column = CallColumn;
Frame.Discriminator = CallDiscriminator;
}
// Get call file/line/column of a current DIE.
if (i + 1 < n) {
FunctionDIE.getCallerFrame(CallFile, CallLine, CallColumn,
CallDiscriminator);
}
}
InliningInfo.addFrame(Frame);
}
return InliningInfo;
}
std::shared_ptr<DWARFContext>
DWARFContext::getDWOContext(StringRef AbsolutePath) {
if (auto S = DWP.lock()) {
DWARFContext *Ctxt = S->Context.get();
return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
}
std::weak_ptr<DWOFile> *Entry = &DWOFiles[AbsolutePath];
if (auto S = Entry->lock()) {
DWARFContext *Ctxt = S->Context.get();
return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
}
SmallString<128> DWPName;
Expected<OwningBinary<ObjectFile>> Obj = [&] {
if (!CheckedForDWP) {
(getFileName() + ".dwp").toVector(DWPName);
auto Obj = object::ObjectFile::createObjectFile(DWPName);
if (Obj) {
Entry = &DWP;
return Obj;
} else {
CheckedForDWP = true;
// TODO: Should this error be handled (maybe in a high verbosity mode)
// before falling back to .dwo files?
consumeError(Obj.takeError());
}
}
return object::ObjectFile::createObjectFile(AbsolutePath);
}();
if (!Obj) {
// TODO: Actually report errors helpfully.
consumeError(Obj.takeError());
return nullptr;
}
auto S = std::make_shared<DWOFile>();
S->File = std::move(Obj.get());
S->Context = llvm::make_unique<DWARFContextInMemory>(*S->File.getBinary());
*Entry = S;
auto *Ctxt = S->Context.get();
return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
}
static Error createError(const Twine &Reason, llvm::Error E) {
return make_error<StringError>(Reason + toString(std::move(E)),
inconvertibleErrorCode());
}
/// SymInfo contains information about a symbol: its address and the index of
/// its section, which is -1LL for absolute symbols.
struct SymInfo {
uint64_t Address;
uint64_t SectionIndex;
};
/// Returns the address of the symbol (or section) the relocation is applied
/// against, together with the index of that section. Used for further
/// relocation computation; the section's load address, when available, is
/// already factored into the returned address.
static Expected<SymInfo> getSymbolInfo(const object::ObjectFile &Obj,
const RelocationRef &Reloc,
const LoadedObjectInfo *L,
std::map<SymbolRef, SymInfo> &Cache) {
SymInfo Ret = {0, (uint64_t)-1LL};
object::section_iterator RSec = Obj.section_end();
object::symbol_iterator Sym = Reloc.getSymbol();
std::map<SymbolRef, SymInfo>::iterator CacheIt = Cache.end();
// First calculate the address of the symbol or section as it appears
// in the object file
if (Sym != Obj.symbol_end()) {
bool New;
std::tie(CacheIt, New) = Cache.insert({*Sym, {0, 0}});
if (!New)
return CacheIt->second;
Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
if (!SymAddrOrErr)
return createError("failed to compute symbol address: ",
SymAddrOrErr.takeError());
// Also remember what section this symbol is in for later
auto SectOrErr = Sym->getSection();
if (!SectOrErr)
return createError("failed to get symbol section: ",
SectOrErr.takeError());
RSec = *SectOrErr;
Ret.Address = *SymAddrOrErr;
} else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
Ret.Address = RSec->getAddress();
}
if (RSec != Obj.section_end())
Ret.SectionIndex = RSec->getIndex();
// If we are given load addresses for the sections, we need to adjust:
// SymAddr = (Address of Symbol Or Section in File) -
// (Address of Section in File) +
// (Load Address of Section)
// RSec is now either the section being targeted or the section
// containing the symbol being targeted. In either case,
// we need to perform the same computation.
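// For example, a symbol at file address 0x10 in a section whose file address
// is 0x0 and whose load address is 0x4000 yields 0x10 - 0x0 + 0x4000 = 0x4010.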
if (L && RSec != Obj.section_end())
if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
Ret.Address += SectionLoadAddress - RSec->getAddress();
if (CacheIt != Cache.end())
CacheIt->second = Ret;
return Ret;
}
static bool isRelocScattered(const object::ObjectFile &Obj,
const RelocationRef &Reloc) {
const MachOObjectFile *MachObj = dyn_cast<MachOObjectFile>(&Obj);
if (!MachObj)
return false;
// MachO also has relocations that point to sections and
// scattered relocations.
auto RelocInfo = MachObj->getRelocation(Reloc.getRawDataRefImpl());
return MachObj->isRelocationScattered(RelocInfo);
}
Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec,
StringRef Name, StringRef &Data) {
if (!Decompressor::isCompressed(Sec))
return Error::success();
Expected<Decompressor> Decompressor =
Decompressor::create(Name, Data, IsLittleEndian, AddressSize == 8);
if (!Decompressor)
return Decompressor.takeError();
SmallString<32> Out;
if (auto Err = Decompressor->resizeAndDecompress(Out))
return Err;
UncompressedSections.emplace_back(std::move(Out));
Data = UncompressedSections.back();
return Error::success();
}
ErrorPolicy DWARFContextInMemory::defaultErrorHandler(Error E) {
errs() << "error: " + toString(std::move(E)) << '\n';
return ErrorPolicy::Continue;
}
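// Construction sketch (illustrative; "input.o" is a placeholder path):
//
//   auto ObjOrErr = object::ObjectFile::createObjectFile("input.o");
//   if (!ObjOrErr) {
//     llvm::consumeError(ObjOrErr.takeError()); // real code would report it
//     return;
//   }
//   DWARFContextInMemory DICtx(*ObjOrErr->getBinary());
//   // A LoadedObjectInfo and an ErrorPolicy callback may be passed as the
//   // second and third arguments; defaultErrorHandler above prints the error
//   // and continues.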
DWARFContextInMemory::DWARFContextInMemory(
const object::ObjectFile &Obj, const LoadedObjectInfo *L,
function_ref<ErrorPolicy(Error)> HandleError)
: FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()),
AddressSize(Obj.getBytesInAddress()) {
for (const SectionRef &Section : Obj.sections()) {
StringRef Name;
Section.getName(Name);
// Skip BSS and Virtual sections, they aren't interesting.
if (Section.isBSS() || Section.isVirtual())
continue;
StringRef Data;
section_iterator RelocatedSection = Section.getRelocatedSection();
// Try to obtain an already relocated version of this section.
// Else use the unrelocated section from the object file. We'll have to
// apply relocations ourselves later.
if (!L || !L->getLoadedSectionContents(*RelocatedSection, Data))
Section.getContents(Data);
if (auto Err = maybeDecompress(Section, Name, Data)) {
ErrorPolicy EP = HandleError(
createError("failed to decompress '" + Name + "', ", std::move(Err)));
if (EP == ErrorPolicy::Halt)
return;
continue;
}
// GNU-style compressed section names start with ".z". At this point the
// section has been decompressed, so we drop the compression prefix.
Name = Name.substr(
Name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
// Map platform specific debug section names to DWARF standard section
// names.
Name = Obj.mapDebugSectionName(Name);
if (StringRef *SectionData = mapSectionToMember(Name)) {
*SectionData = Data;
if (Name == "debug_ranges") {
// FIXME: Use the other dwo range section when we emit it.
RangeDWOSection.Data = Data;
}
} else if (Name == "debug_types") {
// Find debug_types data by section rather than name as there are
// multiple, comdat grouped, debug_types sections.
TypesSections[Section].Data = Data;
} else if (Name == "debug_types.dwo") {
TypesDWOSections[Section].Data = Data;
}
if (RelocatedSection == Obj.section_end())
continue;
StringRef RelSecName;
StringRef RelSecData;
RelocatedSection->getName(RelSecName);
// If the section we're relocating was relocated already by the JIT,
// then we used the relocated version above, so we do not need to process
// relocations for it now.
if (L && L->getLoadedSectionContents(*RelocatedSection, RelSecData))
continue;
// In Mach-o files, the relocations do not need to be applied if
// there is no load offset to apply. The value read at the
// relocation point already factors in the section address
// (actually applying the relocations will produce wrong results
// as the section address will be added twice).
if (!L && isa<MachOObjectFile>(&Obj))
continue;
RelSecName = RelSecName.substr(
RelSecName.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
// TODO: Add support for relocations in other sections as needed.
// Record relocations for the debug_info and debug_line sections.
DWARFSection *Sec = mapNameToDWARFSection(RelSecName);
RelocAddrMap *Map = Sec ? &Sec->Relocs : nullptr;
if (!Map) {
// Find debug_types relocs by section rather than name as there are
// multiple, comdat grouped, debug_types sections.
if (RelSecName == "debug_types")
Map = &TypesSections[*RelocatedSection].Relocs;
else if (RelSecName == "debug_types.dwo")
Map = &TypesDWOSections[*RelocatedSection].Relocs;
else
continue;
}
if (Section.relocation_begin() == Section.relocation_end())
continue;
// Symbol to [address, section index] cache mapping.
std::map<SymbolRef, SymInfo> AddrCache;
for (const RelocationRef &Reloc : Section.relocations()) {
// FIXME: it's not clear how to correctly handle scattered
// relocations.
if (isRelocScattered(Obj, Reloc))
continue;
Expected<SymInfo> SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache);
if (!SymInfoOrErr) {
if (HandleError(SymInfoOrErr.takeError()) == ErrorPolicy::Halt)
return;
continue;
}
object::RelocVisitor V(Obj);
uint64_t Val = V.visit(Reloc.getType(), Reloc, SymInfoOrErr->Address);
if (V.error()) {
SmallString<32> Type;
Reloc.getTypeName(Type);
ErrorPolicy EP = HandleError(
createError("failed to compute relocation: " + Type + ", ",
errorCodeToError(object_error::parse_failed)));
if (EP == ErrorPolicy::Halt)
return;
continue;
}
RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val};
Map->insert({Reloc.getOffset(), Rel});
}
}
}
DWARFContextInMemory::DWARFContextInMemory(
const StringMap<std::unique_ptr<MemoryBuffer>> &Sections, uint8_t AddrSize,
bool isLittleEndian)
: IsLittleEndian(isLittleEndian), AddressSize(AddrSize) {
for (const auto &SecIt : Sections) {
if (StringRef *SectionData = mapSectionToMember(SecIt.first()))
*SectionData = SecIt.second->getBuffer();
}
}
DWARFSection *DWARFContextInMemory::mapNameToDWARFSection(StringRef Name) {
return StringSwitch<DWARFSection *>(Name)
.Case("debug_info", &InfoSection)
.Case("debug_loc", &LocSection)
.Case("debug_line", &LineSection)
.Case("debug_str_offsets", &StringOffsetSection)
.Case("debug_ranges", &RangeSection)
.Case("debug_info.dwo", &InfoDWOSection)
.Case("debug_loc.dwo", &LocDWOSection)
.Case("debug_line.dwo", &LineDWOSection)
.Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
.Case("debug_addr", &AddrSection)
.Case("apple_names", &AppleNamesSection)
.Case("apple_types", &AppleTypesSection)
.Case("apple_namespaces", &AppleNamespacesSection)
.Case("apple_namespac", &AppleNamespacesSection)
.Case("apple_objc", &AppleObjCSection)
.Default(nullptr);
}
StringRef *DWARFContextInMemory::mapSectionToMember(StringRef Name) {
if (DWARFSection *Sec = mapNameToDWARFSection(Name))
return &Sec->Data;
return StringSwitch<StringRef *>(Name)
.Case("debug_abbrev", &AbbrevSection)
.Case("debug_aranges", &ARangeSection)
.Case("debug_frame", &DebugFrameSection)
.Case("eh_frame", &EHFrameSection)
.Case("debug_str", &StringSection)
.Case("debug_macinfo", &MacinfoSection)
.Case("debug_pubnames", &PubNamesSection)
.Case("debug_pubtypes", &PubTypesSection)
.Case("debug_gnu_pubnames", &GnuPubNamesSection)
.Case("debug_gnu_pubtypes", &GnuPubTypesSection)
.Case("debug_abbrev.dwo", &AbbrevDWOSection)
.Case("debug_str.dwo", &StringDWOSection)
.Case("debug_cu_index", &CUIndexSection)
.Case("debug_tu_index", &TUIndexSection)
.Case("gdb_index", &GdbIndexSection)
// Any more debug info sections go here.
.Default(nullptr);
}
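// Example lookups (illustrative): by the time a name reaches these maps, the
// leading '.', '_' and 'z' prefixes (".debug_info", "__debug_info",
// ".zdebug_info") have already been stripped, just as in the relocation
// handling above. "debug_types" is deliberately absent here because the
// comdat-grouped type sections are keyed by SectionRef rather than by name.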
void DWARFContextInMemory::anchor() {}
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 6cf44ffa3796..4de46bea301e 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -1,499 +1,499 @@
//===- DWARFVerifier.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFSection.h"
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <set>
#include <vector>
using namespace llvm;
using namespace dwarf;
using namespace object;
bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
uint32_t *Offset, unsigned UnitIndex,
uint8_t &UnitType, bool &isUnitDWARF64) {
uint32_t AbbrOffset, Length;
uint8_t AddrSize = 0;
uint16_t Version;
bool Success = true;
bool ValidLength = false;
bool ValidVersion = false;
bool ValidAddrSize = false;
bool ValidType = true;
bool ValidAbbrevOffset = true;
uint32_t OffsetStart = *Offset;
Length = DebugInfoData.getU32(Offset);
if (Length == UINT32_MAX) {
isUnitDWARF64 = true;
OS << format(
"Unit[%d] is in 64-bit DWARF format; cannot verify from this point.\n",
UnitIndex);
return false;
}
Version = DebugInfoData.getU16(Offset);
if (Version >= 5) {
UnitType = DebugInfoData.getU8(Offset);
AddrSize = DebugInfoData.getU8(Offset);
AbbrOffset = DebugInfoData.getU32(Offset);
ValidType = DWARFUnit::isValidUnitType(UnitType);
} else {
UnitType = 0;
AbbrOffset = DebugInfoData.getU32(Offset);
AddrSize = DebugInfoData.getU8(Offset);
}
if (!DCtx.getDebugAbbrev()->getAbbreviationDeclarationSet(AbbrOffset))
ValidAbbrevOffset = false;
ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3);
ValidVersion = DWARFContext::isSupportedVersion(Version);
ValidAddrSize = AddrSize == 4 || AddrSize == 8;
if (!ValidLength || !ValidVersion || !ValidAddrSize || !ValidAbbrevOffset ||
!ValidType) {
Success = false;
OS << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, OffsetStart);
if (!ValidLength)
OS << "\tError: The length for this unit is too "
"large for the .debug_info provided.\n";
if (!ValidVersion)
OS << "\tError: The 16 bit unit header version is not valid.\n";
if (!ValidType)
OS << "\tError: The unit type encoding is not valid.\n";
if (!ValidAbbrevOffset)
OS << "\tError: The offset into the .debug_abbrev section is "
"not valid.\n";
if (!ValidAddrSize)
OS << "\tError: The address size is unsupported.\n";
}
*Offset = OffsetStart + Length + 4;
return Success;
}
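// Illustrative layout (assumed DWARF32 v4 unit header, added for exposition):
//   uint32_t Length;     // size of the unit, excluding the Length field
//   uint16_t Version;
//   uint32_t AbbrOffset; // in v5, UnitType and AddrSize are read before this
//   uint8_t  AddrSize;
// Hence the unit's last valid byte is at OffsetStart + Length + 3 and the
// next unit header begins at OffsetStart + Length + 4, matching the checks
// in verifyUnitHeader above.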
bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit) {
uint32_t NumUnitErrors = 0;
unsigned NumDies = Unit.getNumDIEs();
for (unsigned I = 0; I < NumDies; ++I) {
auto Die = Unit.getDIEAtIndex(I);
if (Die.getTag() == DW_TAG_null)
continue;
for (auto AttrValue : Die.attributes()) {
NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue);
NumUnitErrors += verifyDebugInfoForm(Die, AttrValue);
}
}
return NumUnitErrors == 0;
}
bool DWARFVerifier::handleDebugInfo() {
OS << "Verifying .debug_info Unit Header Chain...\n";
DWARFDataExtractor DebugInfoData(DCtx.getInfoSection(), DCtx.isLittleEndian(),
0);
uint32_t NumDebugInfoErrors = 0;
uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0;
uint8_t UnitType = 0;
bool isUnitDWARF64 = false;
bool isHeaderChainValid = true;
bool hasDIE = DebugInfoData.isValidOffset(Offset);
while (hasDIE) {
OffsetStart = Offset;
if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType,
isUnitDWARF64)) {
isHeaderChainValid = false;
if (isUnitDWARF64)
break;
} else {
std::unique_ptr<DWARFUnit> Unit;
switch (UnitType) {
case dwarf::DW_UT_type:
case dwarf::DW_UT_split_type: {
DWARFUnitSection<DWARFTypeUnit> TUSection{};
Unit.reset(new DWARFTypeUnit(
DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(),
&DCtx.getRangeSection(), DCtx.getStringSection(),
DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(),
DCtx.getLineSection(), DCtx.isLittleEndian(), false, TUSection,
nullptr));
break;
}
case dwarf::DW_UT_skeleton:
case dwarf::DW_UT_split_compile:
case dwarf::DW_UT_compile:
case dwarf::DW_UT_partial:
// UnitType = 0 means that we are
// verifying a compile unit in DWARF v4.
case 0: {
DWARFUnitSection<DWARFCompileUnit> CUSection{};
Unit.reset(new DWARFCompileUnit(
DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(),
&DCtx.getRangeSection(), DCtx.getStringSection(),
DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(),
DCtx.getLineSection(), DCtx.isLittleEndian(), false, CUSection,
nullptr));
break;
}
default: { llvm_unreachable("Invalid UnitType."); }
}
Unit->extract(DebugInfoData, &OffsetStart);
if (!verifyUnitContents(*Unit))
++NumDebugInfoErrors;
}
hasDIE = DebugInfoData.isValidOffset(Offset);
++UnitIdx;
}
if (UnitIdx == 0 && !hasDIE) {
OS << "Warning: .debug_info is empty.\n";
isHeaderChainValid = true;
}
NumDebugInfoErrors += verifyDebugInfoReferences();
return (isHeaderChainValid && NumDebugInfoErrors == 0);
}
unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
unsigned NumErrors = 0;
const auto Attr = AttrValue.Attr;
switch (Attr) {
case DW_AT_ranges:
// Make sure the offset in the DW_AT_ranges attribute is valid.
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) {
if (*SectionOffset >= DCtx.getRangeSection().Data.size()) {
++NumErrors;
OS << "error: DW_AT_ranges offset is beyond .debug_ranges "
"bounds:\n";
Die.dump(OS, 0);
OS << "\n";
}
} else {
++NumErrors;
OS << "error: DIE has invalid DW_AT_ranges encoding:\n";
Die.dump(OS, 0);
OS << "\n";
}
break;
case DW_AT_stmt_list:
// Make sure the offset in the DW_AT_stmt_list attribute is valid.
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) {
if (*SectionOffset >= DCtx.getLineSection().Data.size()) {
++NumErrors;
OS << "error: DW_AT_stmt_list offset is beyond .debug_line "
"bounds: "
- << format("0x%08" PRIx32, *SectionOffset) << "\n";
+ << format("0x%08" PRIx64, *SectionOffset) << "\n";
Die.dump(OS, 0);
OS << "\n";
}
} else {
++NumErrors;
OS << "error: DIE has invalid DW_AT_stmt_list encoding:\n";
Die.dump(OS, 0);
OS << "\n";
}
break;
default:
break;
}
return NumErrors;
}
unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
DWARFAttribute &AttrValue) {
unsigned NumErrors = 0;
const auto Form = AttrValue.Value.getForm();
switch (Form) {
case DW_FORM_ref1:
case DW_FORM_ref2:
case DW_FORM_ref4:
case DW_FORM_ref8:
case DW_FORM_ref_udata: {
// Verify all CU relative references are valid CU offsets.
Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
assert(RefVal);
if (RefVal) {
auto DieCU = Die.getDwarfUnit();
auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset();
auto CUOffset = AttrValue.Value.getRawUValue();
if (CUOffset >= CUSize) {
++NumErrors;
OS << "error: " << FormEncodingString(Form) << " CU offset "
- << format("0x%08" PRIx32, CUOffset)
+ << format("0x%08" PRIx64, CUOffset)
<< " is invalid (must be less than CU size of "
<< format("0x%08" PRIx32, CUSize) << "):\n";
Die.dump(OS, 0);
OS << "\n";
} else {
// Valid reference, but we will verify it points to an actual
// DIE later.
ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
}
}
break;
}
case DW_FORM_ref_addr: {
// Verify all absolute DIE references have valid offsets in the
// .debug_info section.
Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
assert(RefVal);
if (RefVal) {
if (*RefVal >= DCtx.getInfoSection().Data.size()) {
++NumErrors;
OS << "error: DW_FORM_ref_addr offset beyond .debug_info "
"bounds:\n";
Die.dump(OS, 0);
OS << "\n";
} else {
// Valid reference, but we will verify it points to an actual
// DIE later.
ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
}
}
break;
}
case DW_FORM_strp: {
auto SecOffset = AttrValue.Value.getAsSectionOffset();
assert(SecOffset); // DW_FORM_strp is a section offset.
if (SecOffset && *SecOffset >= DCtx.getStringSection().size()) {
++NumErrors;
OS << "error: DW_FORM_strp offset beyond .debug_str bounds:\n";
Die.dump(OS, 0);
OS << "\n";
}
break;
}
default:
break;
}
return NumErrors;
}
unsigned DWARFVerifier::verifyDebugInfoReferences() {
// Take all references and make sure they point to an actual DIE by
// getting the DIE by offset, emitting an error for any that do not resolve.
OS << "Verifying .debug_info references...\n";
unsigned NumErrors = 0;
for (auto Pair : ReferenceToDIEOffsets) {
auto Die = DCtx.getDIEForOffset(Pair.first);
if (Die)
continue;
++NumErrors;
OS << "error: invalid DIE reference " << format("0x%08" PRIx64, Pair.first)
<< ". Offset is in between DIEs:\n";
for (auto Offset : Pair.second) {
auto ReferencingDie = DCtx.getDIEForOffset(Offset);
ReferencingDie.dump(OS, 0);
OS << "\n";
}
OS << "\n";
}
return NumErrors;
}
void DWARFVerifier::verifyDebugLineStmtOffsets() {
std::map<uint64_t, DWARFDie> StmtListToDie;
for (const auto &CU : DCtx.compile_units()) {
auto Die = CU->getUnitDIE();
// Get the attribute value as a section offset. No need to produce an
// error here if the encoding isn't correct because we validate this in
// the .debug_info verifier.
auto StmtSectionOffset = toSectionOffset(Die.find(DW_AT_stmt_list));
if (!StmtSectionOffset)
continue;
const uint32_t LineTableOffset = *StmtSectionOffset;
auto LineTable = DCtx.getLineTableForUnit(CU.get());
if (LineTableOffset < DCtx.getLineSection().Data.size()) {
if (!LineTable) {
++NumDebugLineErrors;
OS << "error: .debug_line[" << format("0x%08" PRIx32, LineTableOffset)
<< "] was not able to be parsed for CU:\n";
Die.dump(OS, 0);
OS << '\n';
continue;
}
} else {
// Make sure we don't get a valid line table back if the offset is wrong.
assert(LineTable == nullptr);
// Skip this line table as it isn't valid. No need to create an error
// here because we validate this in the .debug_info verifier.
continue;
}
auto Iter = StmtListToDie.find(LineTableOffset);
if (Iter != StmtListToDie.end()) {
++NumDebugLineErrors;
OS << "error: two compile unit DIEs, "
<< format("0x%08" PRIx32, Iter->second.getOffset()) << " and "
<< format("0x%08" PRIx32, Die.getOffset())
<< ", have the same DW_AT_stmt_list section offset:\n";
Iter->second.dump(OS, 0);
Die.dump(OS, 0);
OS << '\n';
// Already verified this line table before, no need to do it again.
continue;
}
StmtListToDie[LineTableOffset] = Die;
}
}
void DWARFVerifier::verifyDebugLineRows() {
for (const auto &CU : DCtx.compile_units()) {
auto Die = CU->getUnitDIE();
auto LineTable = DCtx.getLineTableForUnit(CU.get());
// If there is no line table we will have created an error in the
// .debug_info verifier or in verifyDebugLineStmtOffsets().
if (!LineTable)
continue;
uint32_t MaxFileIndex = LineTable->Prologue.FileNames.size();
uint64_t PrevAddress = 0;
uint32_t RowIndex = 0;
for (const auto &Row : LineTable->Rows) {
if (Row.Address < PrevAddress) {
++NumDebugLineErrors;
OS << "error: .debug_line["
- << format("0x%08" PRIx32,
+ << format("0x%08" PRIx64,
*toSectionOffset(Die.find(DW_AT_stmt_list)))
<< "] row[" << RowIndex
<< "] decreases in address from previous row:\n";
DWARFDebugLine::Row::dumpTableHeader(OS);
if (RowIndex > 0)
LineTable->Rows[RowIndex - 1].dump(OS);
Row.dump(OS);
OS << '\n';
}
if (Row.File > MaxFileIndex) {
++NumDebugLineErrors;
OS << "error: .debug_line["
- << format("0x%08" PRIx32,
+ << format("0x%08" PRIx64,
*toSectionOffset(Die.find(DW_AT_stmt_list)))
<< "][" << RowIndex << "] has invalid file index " << Row.File
<< " (valid values are [1," << MaxFileIndex << "]):\n";
DWARFDebugLine::Row::dumpTableHeader(OS);
Row.dump(OS);
OS << '\n';
}
if (Row.EndSequence)
PrevAddress = 0;
else
PrevAddress = Row.Address;
++RowIndex;
}
}
}
bool DWARFVerifier::handleDebugLine() {
NumDebugLineErrors = 0;
OS << "Verifying .debug_line...\n";
verifyDebugLineStmtOffsets();
verifyDebugLineRows();
return NumDebugLineErrors == 0;
}
bool DWARFVerifier::handleAppleNames() {
NumAppleNamesErrors = 0;
DWARFDataExtractor AppleNamesSection(DCtx.getAppleNamesSection(),
DCtx.isLittleEndian(), 0);
DataExtractor StrData(DCtx.getStringSection(), DCtx.isLittleEndian(), 0);
DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData);
if (!AppleNames.extract()) {
return true;
}
OS << "Verifying .apple_names...\n";
// Verify that all buckets have a valid hash index or are empty.
uint32_t NumBuckets = AppleNames.getNumBuckets();
uint32_t NumHashes = AppleNames.getNumHashes();
uint32_t BucketsOffset =
AppleNames.getSizeHdr() + AppleNames.getHeaderDataLength();
uint32_t HashesBase = BucketsOffset + NumBuckets * 4;
uint32_t OffsetsBase = HashesBase + NumHashes * 4;
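// Assumed section layout walked below (illustrative sketch only):
//   [header][header data]
//   [buckets : NumBuckets x u32 hash index]
//   [hashes  : NumHashes  x u32 hash value]
//   [offsets : NumHashes  x u32 offset to hash data]
//   [hash data blocks]
// BucketsOffset, HashesBase and OffsetsBase computed above point at the
// start of the bucket, hash and offset arrays respectively.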
for (uint32_t BucketIdx = 0; BucketIdx < NumBuckets; ++BucketIdx) {
uint32_t HashIdx = AppleNamesSection.getU32(&BucketsOffset);
if (HashIdx >= NumHashes && HashIdx != UINT32_MAX) {
OS << format("error: Bucket[%d] has invalid hash index: %u\n", BucketIdx,
HashIdx);
++NumAppleNamesErrors;
}
}
uint32_t NumAtoms = AppleNames.getAtomsDesc().size();
if (NumAtoms == 0) {
OS << "error: no atoms; failed to read HashData\n";
++NumAppleNamesErrors;
return false;
}
if (!AppleNames.validateForms()) {
OS << "error: unsupported form; failed to read HashData\n";
++NumAppleNamesErrors;
return false;
}
for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) {
uint32_t HashOffset = HashesBase + 4 * HashIdx;
uint32_t DataOffset = OffsetsBase + 4 * HashIdx;
uint32_t Hash = AppleNamesSection.getU32(&HashOffset);
uint32_t HashDataOffset = AppleNamesSection.getU32(&DataOffset);
if (!AppleNamesSection.isValidOffsetForDataOfSize(HashDataOffset,
sizeof(uint64_t))) {
OS << format("error: Hash[%d] has invalid HashData offset: 0x%08x\n",
HashIdx, HashDataOffset);
++NumAppleNamesErrors;
}
uint32_t StrpOffset;
uint32_t StringOffset;
uint32_t StringCount = 0;
uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
while ((StrpOffset = AppleNamesSection.getU32(&HashDataOffset)) != 0) {
const uint32_t NumHashDataObjects =
AppleNamesSection.getU32(&HashDataOffset);
for (uint32_t HashDataIdx = 0; HashDataIdx < NumHashDataObjects;
++HashDataIdx) {
DieOffset = AppleNames.readAtoms(HashDataOffset);
if (!DCtx.getDIEForOffset(DieOffset)) {
const uint32_t BucketIdx =
NumBuckets ? (Hash % NumBuckets) : UINT32_MAX;
StringOffset = StrpOffset;
const char *Name = StrData.getCStr(&StringOffset);
if (!Name)
Name = "<NULL>";
OS << format(
"error: .apple_names Bucket[%d] Hash[%d] = 0x%08x "
"Str[%u] = 0x%08x "
"DIE[%d] = 0x%08x is not a valid DIE offset for \"%s\".\n",
BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx,
DieOffset, Name);
++NumAppleNamesErrors;
}
}
++StringCount;
}
}
return NumAppleNamesErrors == 0;
}
diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp
index a515bc8ad16d..ff039463d08c 100644
--- a/lib/Object/COFFImportFile.cpp
+++ b/lib/Object/COFFImportFile.cpp
@@ -1,612 +1,612 @@
//===- COFFImportFile.cpp - COFF short import file implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the writeImportLibrary function.
//
//===----------------------------------------------------------------------===//
#include "llvm/Object/COFFImportFile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFF.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Path.h"
#include <cstdint>
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace llvm::COFF;
using namespace llvm::object;
using namespace llvm;
namespace llvm {
namespace object {
static bool is32bit(MachineTypes Machine) {
switch (Machine) {
default:
llvm_unreachable("unsupported machine");
case IMAGE_FILE_MACHINE_AMD64:
return false;
case IMAGE_FILE_MACHINE_ARMNT:
case IMAGE_FILE_MACHINE_I386:
return true;
}
}
static uint16_t getImgRelRelocation(MachineTypes Machine) {
switch (Machine) {
default:
llvm_unreachable("unsupported machine");
case IMAGE_FILE_MACHINE_AMD64:
return IMAGE_REL_AMD64_ADDR32NB;
case IMAGE_FILE_MACHINE_ARMNT:
return IMAGE_REL_ARM_ADDR32NB;
case IMAGE_FILE_MACHINE_I386:
return IMAGE_REL_I386_DIR32NB;
}
}
template <class T> static void append(std::vector<uint8_t> &B, const T &Data) {
size_t S = B.size();
B.resize(S + sizeof(T));
memcpy(&B[S], &Data, sizeof(T));
}
static void writeStringTable(std::vector<uint8_t> &B,
ArrayRef<const std::string> Strings) {
// The COFF string table consists of a 4-byte value which is the size of the
// table, including the length field itself. This value is followed by the
// string content itself, which is an array of null-terminated C-style
// strings. The termination is important, as the strings are referenced by
// offset from the symbol entries elsewhere in the file format.
size_t Pos = B.size();
size_t Offset = B.size();
// Skip over the length field; we will fill it in later, once the length has
// been computed while emitting the string content itself.
Pos += sizeof(uint32_t);
for (const auto &S : Strings) {
B.resize(Pos + S.length() + 1);
strcpy(reinterpret_cast<char *>(&B[Pos]), S.c_str());
Pos += S.length() + 1;
}
// Backfill the length of the table now that it has been computed.
support::ulittle32_t Length(B.size() - Offset);
support::endian::write32le(&B[Offset], Length);
}
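// Worked example (hypothetical strings, for illustration): for the inputs
// {"foo", "ba"} the table emitted by writeStringTable above is
//   0B 00 00 00 'f' 'o' 'o' 00 'b' 'a' 00
// i.e. a little-endian uint32_t length of 11 that counts the length field
// itself, followed by the NUL-terminated strings.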
static ImportNameType getNameType(StringRef Sym, StringRef ExtName,
MachineTypes Machine) {
if (Sym != ExtName)
return IMPORT_NAME_UNDECORATE;
if (Machine == IMAGE_FILE_MACHINE_I386 && Sym.startswith("_"))
return IMPORT_NAME_NOPREFIX;
return IMPORT_NAME;
}
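// Example decisions made by getNameType above (hypothetical symbols, for
// illustration): on i386, a symbol "_func" whose external name is also
// "_func" yields IMPORT_NAME_NOPREFIX; a decorated symbol "_func@4" exported
// as "func" yields IMPORT_NAME_UNDECORATE; anything else yields IMPORT_NAME.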
static Expected<std::string> replace(StringRef S, StringRef From,
StringRef To) {
size_t Pos = S.find(From);
// From and To may be mangled, but substrings in S may not.
if (Pos == StringRef::npos && From.startswith("_") && To.startswith("_")) {
From = From.substr(1);
To = To.substr(1);
Pos = S.find(From);
}
if (Pos == StringRef::npos) {
return make_error<StringError>(
StringRef(Twine(S + ": replacing '" + From +
"' with '" + To + "' failed").str()), object_error::parse_failed);
}
return (Twine(S.substr(0, Pos)) + To + S.substr(Pos + From.size())).str();
}
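// Worked example for replace() above (hypothetical names, for illustration):
//   replace("_foo@4", "_foo", "_bar") -> "_bar@4"
//   replace("foo@4", "_foo", "_bar")  -> "bar@4"  (retries with the
//                                                  unprefixed names)
// If neither form of From occurs in S, a parse_failed error is returned.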
static const std::string NullImportDescriptorSymbolName =
"__NULL_IMPORT_DESCRIPTOR";
namespace {
// This class constructs various small object files necessary to support linking
// symbols imported from a DLL. The contents are pretty strictly defined and
// nearly entirely static. The details of the structures are defined in
// WINNT.h and the PE/COFF specification.
class ObjectFactory {
using u16 = support::ulittle16_t;
using u32 = support::ulittle32_t;
MachineTypes Machine;
BumpPtrAllocator Alloc;
StringRef ImportName;
StringRef Library;
std::string ImportDescriptorSymbolName;
std::string NullThunkSymbolName;
public:
ObjectFactory(StringRef S, MachineTypes M)
: Machine(M), ImportName(S), Library(S.drop_back(4)),
ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()),
NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {}
// Creates an Import Descriptor. This is a small object file which contains a
// reference to the terminators and contains the library name (entry) for the
// import name table. It will force the linker to construct the necessary
// structure to import symbols from the DLL.
NewArchiveMember createImportDescriptor(std::vector<uint8_t> &Buffer);
// Creates a NULL import descriptor. This is a small object file which
// contains a NULL import descriptor. It is used to terminate the imports
// from a specific DLL.
NewArchiveMember createNullImportDescriptor(std::vector<uint8_t> &Buffer);
// Create a NULL Thunk Entry. This is a small object file which contains a
// NULL Import Address Table entry and a NULL Import Lookup Table Entry. It
// is used to terminate the IAT and ILT.
NewArchiveMember createNullThunk(std::vector<uint8_t> &Buffer);
// Create a short import file, as described in the PE/COFF spec, section 7:
// Import Library Format.
NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal,
ImportType Type, ImportNameType NameType);
// Create a weak external file which is described in PE/COFF Aux Format 3.
NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp);
};
} // namespace
NewArchiveMember
ObjectFactory::createImportDescriptor(std::vector<uint8_t> &Buffer) {
const uint32_t NumberOfSections = 2;
const uint32_t NumberOfSymbols = 7;
const uint32_t NumberOfRelocations = 3;
// COFF Header
coff_file_header Header{
u16(Machine),
u16(NumberOfSections),
u32(0),
u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
// .idata$2
sizeof(coff_import_directory_table_entry) +
NumberOfRelocations * sizeof(coff_relocation) +
// .idata$4
(ImportName.size() + 1)),
u32(NumberOfSymbols),
u16(0),
u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
};
append(Buffer, Header);
// Section Header Table
const coff_section SectionTable[NumberOfSections] = {
{{'.', 'i', 'd', 'a', 't', 'a', '$', '2'},
u32(0),
u32(0),
u32(sizeof(coff_import_directory_table_entry)),
u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
sizeof(coff_import_directory_table_entry)),
u32(0),
u16(NumberOfRelocations),
u16(0),
u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
{{'.', 'i', 'd', 'a', 't', 'a', '$', '6'},
u32(0),
u32(0),
u32(ImportName.size() + 1),
u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
sizeof(coff_import_directory_table_entry) +
NumberOfRelocations * sizeof(coff_relocation)),
u32(0),
u32(0),
u16(0),
u16(0),
u32(IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
};
append(Buffer, SectionTable);
// .idata$2
const coff_import_directory_table_entry ImportDescriptor{
u32(0), u32(0), u32(0), u32(0), u32(0),
};
append(Buffer, ImportDescriptor);
const coff_relocation RelocationTable[NumberOfRelocations] = {
{u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2),
u16(getImgRelRelocation(Machine))},
{u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)),
u32(3), u16(getImgRelRelocation(Machine))},
{u32(offsetof(coff_import_directory_table_entry, ImportAddressTableRVA)),
u32(4), u16(getImgRelRelocation(Machine))},
};
append(Buffer, RelocationTable);
// .idata$6
auto S = Buffer.size();
Buffer.resize(S + ImportName.size() + 1);
memcpy(&Buffer[S], ImportName.data(), ImportName.size());
Buffer[S + ImportName.size()] = '\0';
// Symbol Table
coff_symbol16 SymbolTable[NumberOfSymbols] = {
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(1),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
{{{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}},
u32(0),
u16(1),
u16(0),
IMAGE_SYM_CLASS_SECTION,
0},
{{{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}},
u32(0),
u16(2),
u16(0),
IMAGE_SYM_CLASS_STATIC,
0},
{{{'.', 'i', 'd', 'a', 't', 'a', '$', '4'}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_SECTION,
0},
{{{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_SECTION,
0},
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
};
// TODO: Name.Offset.Offset here and in all the similar places below
// suggests a naming refactoring. Maybe StringTableOffset.Value?
SymbolTable[0].Name.Offset.Offset =
sizeof(uint32_t);
SymbolTable[5].Name.Offset.Offset =
sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1;
SymbolTable[6].Name.Offset.Offset =
sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1 +
NullImportDescriptorSymbolName.length() + 1;
append(Buffer, SymbolTable);
// String Table
writeStringTable(Buffer,
{ImportDescriptorSymbolName, NullImportDescriptorSymbolName,
NullThunkSymbolName});
StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
return {MemoryBufferRef(F, ImportName)};
}
NewArchiveMember
ObjectFactory::createNullImportDescriptor(std::vector<uint8_t> &Buffer) {
const uint32_t NumberOfSections = 1;
const uint32_t NumberOfSymbols = 1;
// COFF Header
coff_file_header Header{
u16(Machine),
u16(NumberOfSections),
u32(0),
u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
// .idata$3
sizeof(coff_import_directory_table_entry)),
u32(NumberOfSymbols),
u16(0),
u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
};
append(Buffer, Header);
// Section Header Table
const coff_section SectionTable[NumberOfSections] = {
{{'.', 'i', 'd', 'a', 't', 'a', '$', '3'},
u32(0),
u32(0),
u32(sizeof(coff_import_directory_table_entry)),
u32(sizeof(coff_file_header) +
(NumberOfSections * sizeof(coff_section))),
u32(0),
u32(0),
u16(0),
u16(0),
u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
};
append(Buffer, SectionTable);
// .idata$3
const coff_import_directory_table_entry ImportDescriptor{
u32(0), u32(0), u32(0), u32(0), u32(0),
};
append(Buffer, ImportDescriptor);
// Symbol Table
coff_symbol16 SymbolTable[NumberOfSymbols] = {
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(1),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
};
SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
append(Buffer, SymbolTable);
// String Table
writeStringTable(Buffer, {NullImportDescriptorSymbolName});
StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
return {MemoryBufferRef(F, ImportName)};
}
NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) {
const uint32_t NumberOfSections = 2;
const uint32_t NumberOfSymbols = 1;
uint32_t VASize = is32bit(Machine) ? 4 : 8;
// COFF Header
coff_file_header Header{
u16(Machine),
u16(NumberOfSections),
u32(0),
u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
// .idata$5
VASize +
// .idata$4
VASize),
u32(NumberOfSymbols),
u16(0),
u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
};
append(Buffer, Header);
// Section Header Table
const coff_section SectionTable[NumberOfSections] = {
{{'.', 'i', 'd', 'a', 't', 'a', '$', '5'},
u32(0),
u32(0),
u32(VASize),
u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
u32(0),
u32(0),
u16(0),
u16(0),
u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
: IMAGE_SCN_ALIGN_8BYTES) |
IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
IMAGE_SCN_MEM_WRITE)},
{{'.', 'i', 'd', 'a', 't', 'a', '$', '4'},
u32(0),
u32(0),
u32(VASize),
u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
VASize),
u32(0),
u32(0),
u16(0),
u16(0),
u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
: IMAGE_SCN_ALIGN_8BYTES) |
IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
IMAGE_SCN_MEM_WRITE)},
};
append(Buffer, SectionTable);
// .idata$5, ILT
append(Buffer, u32(0));
if (!is32bit(Machine))
append(Buffer, u32(0));
// .idata$4, IAT
append(Buffer, u32(0));
if (!is32bit(Machine))
append(Buffer, u32(0));
// Symbol Table
coff_symbol16 SymbolTable[NumberOfSymbols] = {
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(1),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
};
SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
append(Buffer, SymbolTable);
// String Table
writeStringTable(Buffer, {NullThunkSymbolName});
StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
return {MemoryBufferRef{F, ImportName}};
}
NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
uint16_t Ordinal,
ImportType ImportType,
ImportNameType NameType) {
size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs
size_t Size = sizeof(coff_import_header) + ImpSize;
char *Buf = Alloc.Allocate<char>(Size);
memset(Buf, 0, Size);
char *P = Buf;
// Write short import library.
auto *Imp = reinterpret_cast<coff_import_header *>(P);
P += sizeof(*Imp);
Imp->Sig2 = 0xFFFF;
Imp->Machine = Machine;
Imp->SizeOfData = ImpSize;
if (Ordinal > 0)
Imp->OrdinalHint = Ordinal;
Imp->TypeInfo = (NameType << 2) | ImportType;
// Write symbol name and DLL name.
memcpy(P, Sym.data(), Sym.size());
P += Sym.size() + 1;
memcpy(P, ImportName.data(), ImportName.size());
return {MemoryBufferRef(StringRef(Buf, Size), ImportName)};
}
NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
StringRef Weak, bool Imp) {
std::vector<uint8_t> Buffer;
const uint32_t NumberOfSections = 1;
const uint32_t NumberOfSymbols = 5;
// COFF Header
coff_file_header Header{
u16(0),
u16(NumberOfSections),
u32(0),
u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))),
u32(NumberOfSymbols),
u16(0),
u16(0),
};
append(Buffer, Header);
// Section Header Table
const coff_section SectionTable[NumberOfSections] = {
{{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'},
u32(0),
u32(0),
u32(0),
u32(0),
u32(0),
u32(0),
u16(0),
u16(0),
u32(IMAGE_SCN_LNK_INFO | IMAGE_SCN_LNK_REMOVE)}};
append(Buffer, SectionTable);
// Symbol Table
coff_symbol16 SymbolTable[NumberOfSymbols] = {
{{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}},
u32(0),
u16(0xFFFF),
u16(0),
IMAGE_SYM_CLASS_STATIC,
0},
{{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}},
u32(0),
u16(0xFFFF),
u16(0),
IMAGE_SYM_CLASS_STATIC,
0},
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_EXTERNAL,
0},
{{{0, 0, 0, 0, 0, 0, 0, 0}},
u32(0),
u16(0),
u16(0),
IMAGE_SYM_CLASS_WEAK_EXTERNAL,
1},
{{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0},
};
SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t);
// __imp_ String Table
StringRef Prefix = Imp ? "__imp_" : "";
SymbolTable[3].Name.Offset.Offset =
sizeof(uint32_t) + Sym.size() + Prefix.size() + 1;
append(Buffer, SymbolTable);
writeStringTable(Buffer, {(Prefix + Sym).str(),
(Prefix + Weak).str()});
// Copied here so we can still use writeStringTable
char *Buf = Alloc.Allocate<char>(Buffer.size());
memcpy(Buf, Buffer.data(), Buffer.size());
return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)};
}
std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
ArrayRef<COFFShortExport> Exports,
- MachineTypes Machine) {
+ MachineTypes Machine, bool MakeWeakAliases) {
std::vector<NewArchiveMember> Members;
ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine);
std::vector<uint8_t> ImportDescriptor;
Members.push_back(OF.createImportDescriptor(ImportDescriptor));
std::vector<uint8_t> NullImportDescriptor;
Members.push_back(OF.createNullImportDescriptor(NullImportDescriptor));
std::vector<uint8_t> NullThunk;
Members.push_back(OF.createNullThunk(NullThunk));
for (COFFShortExport E : Exports) {
if (E.Private)
continue;
- if (E.isWeak()) {
+ if (E.isWeak() && MakeWeakAliases) {
Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false));
Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true));
continue;
}
ImportType ImportType = IMPORT_CODE;
if (E.Data)
ImportType = IMPORT_DATA;
if (E.Constant)
ImportType = IMPORT_CONST;
- StringRef SymbolName = E.isWeak() ? E.ExtName : E.Name;
+ StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
Expected<std::string> Name = E.ExtName.empty()
? SymbolName
: replace(SymbolName, E.Name, E.ExtName);
if (!Name) {
return errorToErrorCode(Name.takeError());
}
Members.push_back(
OF.createShortImport(*Name, E.Ordinal, ImportType, NameType));
}
std::pair<StringRef, std::error_code> Result =
writeArchive(Path, Members, /*WriteSymtab*/ true, object::Archive::K_GNU,
/*Deterministic*/ true, /*Thin*/ false);
return Result.second;
}
} // namespace object
} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 005f2d51e403..9a7f45bde6c9 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1,1783 +1,1798 @@
//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
typedef struct LdStPairFlags {
// If a matching instruction is found, MergeForward is set to true if the
// merge is to remove the first instruction and replace the second with
// a pair-wise insn, and false if the reverse is true.
bool MergeForward = false;
// SExtIdx gives the index of the result of the load pair that must be
// extended. The value of SExtIdx assumes that the paired load produces the
// value in this order: (I, returned iterator), i.e., -1 means no value has
// to be extended, 0 means I, and 1 means the returned iterator.
int SExtIdx = -1;
LdStPairFlags() = default;
void setMergeForward(bool V = true) { MergeForward = V; }
bool getMergeForward() const { return MergeForward; }
void setSExtIdx(int V) { SExtIdx = V; }
int getSExtIdx() const { return SExtIdx; }
} LdStPairFlags;
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}
AliasAnalysis *AA;
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
// Track which registers have been modified and used.
BitVector ModifiedRegs, UsedRegs;
virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,
unsigned Limit,
bool FindNarrowMerge);
// Scan the instructions looking for a store that writes to the address from
// which the current load instruction reads. Return true if one is found.
bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI);
// Merge the two narrow zero-store instructions indicated into one wider store.
MachineBasicBlock::iterator
mergeNarrowZeroStores(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator MergeMI,
const LdStPairFlags &Flags);
// Merge the two instructions indicated into a single pair-wise instruction.
MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags);
// Promote the load that reads directly from the address stored to.
MachineBasicBlock::iterator
promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator StoreI);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre- or post-indexed addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre- or post-indexed addressing with writeback. Scan backwards.
MachineBasicBlock::iterator
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
// Find an instruction that updates the base register of the ld/st
// instruction.
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
// Find and pair ldr/str instructions.
bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};
char AArch64LoadStoreOpt::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
static bool isNarrowStore(unsigned Opc) {
switch (Opc) {
default:
return false;
case AArch64::STRBBui:
case AArch64::STURBBi:
case AArch64::STRHHui:
case AArch64::STURHHi:
return true;
}
}
// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Opcode has unknown scale!");
case AArch64::LDRBBui:
case AArch64::LDURBBi:
case AArch64::LDRSBWui:
case AArch64::LDURSBWi:
case AArch64::STRBBui:
case AArch64::STURBBi:
return 1;
case AArch64::LDRHHui:
case AArch64::LDURHHi:
case AArch64::LDRSHWui:
case AArch64::LDURSHWi:
case AArch64::STRHHui:
case AArch64::STURHHi:
return 2;
case AArch64::LDRSui:
case AArch64::LDURSi:
case AArch64::LDRSWui:
case AArch64::LDURSWi:
case AArch64::LDRWui:
case AArch64::LDURWi:
case AArch64::STRSui:
case AArch64::STURSi:
case AArch64::STRWui:
case AArch64::STURWi:
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPWi:
case AArch64::STPSi:
case AArch64::STPWi:
return 4;
case AArch64::LDRDui:
case AArch64::LDURDi:
case AArch64::LDRXui:
case AArch64::LDURXi:
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRXui:
case AArch64::STURXi:
case AArch64::LDPDi:
case AArch64::LDPXi:
case AArch64::STPDi:
case AArch64::STPXi:
return 8;
case AArch64::LDRQui:
case AArch64::LDURQi:
case AArch64::STRQui:
case AArch64::STURQi:
case AArch64::LDPQi:
case AArch64::STPQi:
return 16;
}
}
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
bool *IsValidLdStrOpc = nullptr) {
if (IsValidLdStrOpc)
*IsValidLdStrOpc = true;
switch (Opc) {
default:
if (IsValidLdStrOpc)
*IsValidLdStrOpc = false;
return std::numeric_limits<unsigned>::max();
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRQui:
case AArch64::STURQi:
case AArch64::STRBBui:
case AArch64::STURBBi:
case AArch64::STRHHui:
case AArch64::STURHHi:
case AArch64::STRWui:
case AArch64::STURWi:
case AArch64::STRXui:
case AArch64::STURXi:
case AArch64::LDRDui:
case AArch64::LDURDi:
case AArch64::LDRQui:
case AArch64::LDURQi:
case AArch64::LDRWui:
case AArch64::LDURWi:
case AArch64::LDRXui:
case AArch64::LDURXi:
case AArch64::STRSui:
case AArch64::STURSi:
case AArch64::LDRSui:
case AArch64::LDURSi:
return Opc;
case AArch64::LDRSWui:
return AArch64::LDRWui;
case AArch64::LDURSWi:
return AArch64::LDURWi;
}
}
static unsigned getMatchingWideOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no wide equivalent!");
case AArch64::STRBBui:
return AArch64::STRHHui;
case AArch64::STRHHui:
return AArch64::STRWui;
case AArch64::STURBBi:
return AArch64::STURHHi;
case AArch64::STURHHi:
return AArch64::STURWi;
case AArch64::STURWi:
return AArch64::STURXi;
case AArch64::STRWui:
return AArch64::STRXui;
}
}
static unsigned getMatchingPairOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no pairwise equivalent!");
case AArch64::STRSui:
case AArch64::STURSi:
return AArch64::STPSi;
case AArch64::STRDui:
case AArch64::STURDi:
return AArch64::STPDi;
case AArch64::STRQui:
case AArch64::STURQi:
return AArch64::STPQi;
case AArch64::STRWui:
case AArch64::STURWi:
return AArch64::STPWi;
case AArch64::STRXui:
case AArch64::STURXi:
return AArch64::STPXi;
case AArch64::LDRSui:
case AArch64::LDURSi:
return AArch64::LDPSi;
case AArch64::LDRDui:
case AArch64::LDURDi:
return AArch64::LDPDi;
case AArch64::LDRQui:
case AArch64::LDURQi:
return AArch64::LDPQi;
case AArch64::LDRWui:
case AArch64::LDURWi:
return AArch64::LDPWi;
case AArch64::LDRXui:
case AArch64::LDURXi:
return AArch64::LDPXi;
case AArch64::LDRSWui:
case AArch64::LDURSWi:
return AArch64::LDPSWi;
}
}
static unsigned isMatchingStore(MachineInstr &LoadInst,
MachineInstr &StoreInst) {
unsigned LdOpc = LoadInst.getOpcode();
unsigned StOpc = StoreInst.getOpcode();
switch (LdOpc) {
default:
llvm_unreachable("Unsupported load instruction!");
case AArch64::LDRBBui:
return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
case AArch64::LDURBBi:
return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
case AArch64::LDRHHui:
return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
StOpc == AArch64::STRXui;
case AArch64::LDURHHi:
return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
StOpc == AArch64::STURXi;
case AArch64::LDRWui:
return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
case AArch64::LDURWi:
return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
case AArch64::LDRXui:
return StOpc == AArch64::STRXui;
case AArch64::LDURXi:
return StOpc == AArch64::STURXi;
}
}
static unsigned getPreIndexedOpcode(unsigned Opc) {
+ // FIXME: We don't currently support creating pre-indexed loads/stores when
+ // the load or store is the unscaled version. If we decide to perform such an
+ // optimization in the future the cases for the unscaled loads/stores will
+ // need to be added here.
switch (Opc) {
default:
llvm_unreachable("Opcode has no pre-indexed equivalent!");
case AArch64::STRSui:
return AArch64::STRSpre;
case AArch64::STRDui:
return AArch64::STRDpre;
case AArch64::STRQui:
return AArch64::STRQpre;
case AArch64::STRBBui:
return AArch64::STRBBpre;
case AArch64::STRHHui:
return AArch64::STRHHpre;
case AArch64::STRWui:
return AArch64::STRWpre;
case AArch64::STRXui:
return AArch64::STRXpre;
case AArch64::LDRSui:
return AArch64::LDRSpre;
case AArch64::LDRDui:
return AArch64::LDRDpre;
case AArch64::LDRQui:
return AArch64::LDRQpre;
case AArch64::LDRBBui:
return AArch64::LDRBBpre;
case AArch64::LDRHHui:
return AArch64::LDRHHpre;
case AArch64::LDRWui:
return AArch64::LDRWpre;
case AArch64::LDRXui:
return AArch64::LDRXpre;
case AArch64::LDRSWui:
return AArch64::LDRSWpre;
case AArch64::LDPSi:
return AArch64::LDPSpre;
case AArch64::LDPSWi:
return AArch64::LDPSWpre;
case AArch64::LDPDi:
return AArch64::LDPDpre;
case AArch64::LDPQi:
return AArch64::LDPQpre;
case AArch64::LDPWi:
return AArch64::LDPWpre;
case AArch64::LDPXi:
return AArch64::LDPXpre;
case AArch64::STPSi:
return AArch64::STPSpre;
case AArch64::STPDi:
return AArch64::STPDpre;
case AArch64::STPQi:
return AArch64::STPQpre;
case AArch64::STPWi:
return AArch64::STPWpre;
case AArch64::STPXi:
return AArch64::STPXpre;
}
}
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
case AArch64::STRSui:
+ case AArch64::STURSi:
return AArch64::STRSpost;
case AArch64::STRDui:
+ case AArch64::STURDi:
return AArch64::STRDpost;
case AArch64::STRQui:
+ case AArch64::STURQi:
return AArch64::STRQpost;
case AArch64::STRBBui:
return AArch64::STRBBpost;
case AArch64::STRHHui:
return AArch64::STRHHpost;
case AArch64::STRWui:
+ case AArch64::STURWi:
return AArch64::STRWpost;
case AArch64::STRXui:
+ case AArch64::STURXi:
return AArch64::STRXpost;
case AArch64::LDRSui:
+ case AArch64::LDURSi:
return AArch64::LDRSpost;
case AArch64::LDRDui:
+ case AArch64::LDURDi:
return AArch64::LDRDpost;
case AArch64::LDRQui:
+ case AArch64::LDURQi:
return AArch64::LDRQpost;
case AArch64::LDRBBui:
return AArch64::LDRBBpost;
case AArch64::LDRHHui:
return AArch64::LDRHHpost;
case AArch64::LDRWui:
+ case AArch64::LDURWi:
return AArch64::LDRWpost;
case AArch64::LDRXui:
+ case AArch64::LDURXi:
return AArch64::LDRXpost;
case AArch64::LDRSWui:
return AArch64::LDRSWpost;
case AArch64::LDPSi:
return AArch64::LDPSpost;
case AArch64::LDPSWi:
return AArch64::LDPSWpost;
case AArch64::LDPDi:
return AArch64::LDPDpost;
case AArch64::LDPQi:
return AArch64::LDPQpost;
case AArch64::LDPWi:
return AArch64::LDPWpost;
case AArch64::LDPXi:
return AArch64::LDPXpost;
case AArch64::STPSi:
return AArch64::STPSpost;
case AArch64::STPDi:
return AArch64::STPDpost;
case AArch64::STPQi:
return AArch64::STPQpost;
case AArch64::STPWi:
return AArch64::STPWpost;
case AArch64::STPXi:
return AArch64::STPXpost;
}
}
static bool isPairedLdSt(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPDi:
case AArch64::LDPQi:
case AArch64::LDPWi:
case AArch64::LDPXi:
case AArch64::STPSi:
case AArch64::STPDi:
case AArch64::STPQi:
case AArch64::STPWi:
case AArch64::STPXi:
return true;
}
}
static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
return MI.getOperand(Idx);
}
static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
return MI.getOperand(Idx);
}
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
return MI.getOperand(Idx);
}
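// Operand order assumed by the three accessors above (illustrative):
//   STRXui %x0, %sp, 8       -> (Rt, base, imm):      base idx 1, offset idx 2
//   STPXi  %x0, %x1, %sp, 2  -> (Rt, Rt2, base, imm): base idx 2, offset idx 3
// Register names and immediates here are arbitrary example values.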
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
MachineInstr &StoreInst,
const AArch64InstrInfo *TII) {
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
int LoadSize = getMemScale(LoadInst);
int StoreSize = getMemScale(StoreInst);
int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
? getLdStOffsetOp(StoreInst).getImm()
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst)
? getLdStOffsetOp(LoadInst).getImm()
: getLdStOffsetOp(LoadInst).getImm() * LoadSize;
return (UnscaledStOffset <= UnscaledLdOffset) &&
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
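// Worked example (arbitrary offsets, for illustration): a STRXui with scaled
// offset 1 stores bytes [8, 16) from the base register, and an LDRWui with
// scaled offset 3 reads bytes [12, 16); the load range lies inside the store
// range, so isLdOffsetInRangeOfSt returns true and the load can be fed from
// the stored value.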
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
isNarrowStore(Opc)) &&
getLdStRegOp(MI).getReg() == AArch64::WZR;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator MergeMI,
const LdStPairFlags &Flags) {
assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
"Expected promotable zero stores.");
MachineBasicBlock::iterator NextI = I;
++NextI;
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == MergeMI)
++NextI;
unsigned Opc = I->getOpcode();
bool IsScaled = !TII->isUnscaledLdSt(Opc);
int OffsetStride = IsScaled ? 1 : getMemScale(*I);
bool MergeForward = Flags.getMergeForward();
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
// MergeForward also determines from which instruction we copy the base
// register operand, so that the resulting flags are compatible with the
// input code.
const MachineOperand &BaseRegOp =
MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI;
if (getLdStOffsetOp(*I).getImm() ==
getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
RtMI = &*MergeMI;
else
RtMI = &*I;
int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
// Change the scaled offset from small to large type.
if (IsScaled) {
assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
OffsetImm /= 2;
}
// Construct the new instruction.
DebugLoc DL = I->getDebugLoc();
MachineBasicBlock *MBB = I->getParent();
MachineInstrBuilder MIB;
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
.add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*MergeMI));
(void)MIB;
DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(MergeMI->print(dbgs()));
DEBUG(dbgs() << " with instruction:\n ");
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
MergeMI->eraseFromParent();
return NextI;
}
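// Illustrative transformation (arbitrary register/offsets, added for
// exposition): mergeNarrowZeroStores above rewrites two adjacent 16-bit
// zero stores
//   STRHHui %wzr, %x0, 0
//   STRHHui %wzr, %x0, 1
// into a single 32-bit zero store
//   STRWui %wzr, %x0, 0
// using the matching wide opcode and the halved (rescaled) offset.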
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags) {
MachineBasicBlock::iterator NextI = I;
++NextI;
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way, the merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == Paired)
++NextI;
int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
bool IsUnscaled = TII->isUnscaledLdSt(Opc);
int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
bool MergeForward = Flags.getMergeForward();
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
// MergeForward also determines from which instruction we copy the base
// register operand, so that the resulting flags are compatible with the
// input code.
const MachineOperand &BaseRegOp =
MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
int Offset = getLdStOffsetOp(*I).getImm();
int PairedOffset = getLdStOffsetOp(*Paired).getImm();
bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode());
if (IsUnscaled != PairedIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled. If
// I is scaled then scale the offset of Paired accordingly. Otherwise, do
// the opposite (i.e., make Paired's offset unscaled).
int MemSize = getMemScale(*Paired);
if (PairedIsUnscaled) {
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together.
assert(!(PairedOffset % getMemScale(*Paired)) &&
"Offset should be a multiple of the stride!");
PairedOffset /= MemSize;
} else {
PairedOffset *= MemSize;
}
}
// Which register is Rt and which is Rt2 depends on the offset order.
MachineInstr *RtMI, *Rt2MI;
if (Offset == PairedOffset + OffsetStride) {
RtMI = &*Paired;
Rt2MI = &*I;
// Here we swapped the assumption made for SExtIdx.
// I.e., we turn ldp I, Paired into ldp Paired, I.
// Update the index accordingly.
if (SExtIdx != -1)
SExtIdx = (SExtIdx + 1) % 2;
} else {
RtMI = &*I;
Rt2MI = &*Paired;
}
int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
// Scale the immediate offset, if necessary.
if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
assert(!(OffsetImm % getMemScale(*RtMI)) &&
"Unscaled offset cannot be scaled.");
OffsetImm /= getMemScale(*RtMI);
}
// Construct the new instruction.
MachineInstrBuilder MIB;
DebugLoc DL = I->getDebugLoc();
MachineBasicBlock *MBB = I->getParent();
MachineOperand RegOp0 = getLdStRegOp(*RtMI);
MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
// Kill flags may become invalid when moving stores for pairing.
if (RegOp0.isUse()) {
if (!MergeForward) {
// Clear kill flags on store if moving upwards. Example:
// STRWui %w0, ...
// USE %w1
// STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
RegOp0.setIsKill(false);
RegOp1.setIsKill(false);
} else {
// Clear kill flags of the first store's register. Example:
// STRWui %w1, ...
// USE kill %w1 ; need to clear kill flag when moving STRWui downwards
// STRW %w0
unsigned Reg = getLdStRegOp(*I).getReg();
for (MachineInstr &MI : make_range(std::next(I), Paired))
MI.clearRegisterKills(Reg, TRI);
}
}
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
.add(RegOp0)
.add(RegOp1)
.add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*Paired));
(void)MIB;
DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(Paired->print(dbgs()));
DEBUG(dbgs() << " with instruction:\n ");
if (SExtIdx != -1) {
// Generate the sign extension for the proper result of the ldp.
// I.e., with X1, that would be:
// %W1<def> = KILL %W1, %X1<imp-def>
// %X1<def> = SBFMXri %X1<kill>, 0, 31
MachineOperand &DstMO = MIB->getOperand(SExtIdx);
// Right now, DstMO has the extended register, since it comes from an
// extended opcode.
unsigned DstRegX = DstMO.getReg();
// Get the W variant of that register.
unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
// Update the result of LDP to use the W instead of the X variant.
DstMO.setReg(DstRegW);
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
DEBUG(dbgs() << "\n");
// Make the machine verifier happy by providing a definition for
// the X register.
// Insert this definition right after the generated LDP, i.e., before
// InsertionPoint.
MachineInstrBuilder MIBKill =
BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
.addReg(DstRegW)
.addReg(DstRegX, RegState::Define);
MIBKill->getOperand(2).setImplicit();
// Create the sign extension.
MachineInstrBuilder MIBSXTW =
BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
.addReg(DstRegX)
.addImm(0)
.addImm(31);
(void)MIBSXTW;
DEBUG(dbgs() << " Extend operand:\n ");
DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
} else {
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
}
DEBUG(dbgs() << "\n");
// Erase the old instructions.
I->eraseFromParent();
Paired->eraseFromParent();
return NextI;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator StoreI) {
MachineBasicBlock::iterator NextI = LoadI;
++NextI;
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
unsigned LdRt = getLdStRegOp(*LoadI).getReg();
const MachineOperand &StMO = getLdStRegOp(*StoreI);
unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
assert((IsStoreXReg ||
TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
"Unexpected RegClass");
MachineInstr *BitExtMI;
if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
// Remove the load if the destination register of the load is the same as
// the register holding the stored value.
if (StRt == LdRt && LoadSize == 8) {
for (MachineInstr &MI : make_range(StoreI->getIterator(),
LoadI->getIterator())) {
if (MI.killsRegister(StRt, TRI)) {
MI.clearRegisterKills(StRt, TRI);
break;
}
}
DEBUG(dbgs() << "Remove load instruction:\n ");
DEBUG(LoadI->print(dbgs()));
DEBUG(dbgs() << "\n");
LoadI->eraseFromParent();
return NextI;
}
// Replace the load with a mov if the load and store are the same size.
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
.add(StMO)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
// performance and correctness are verified only in little-endian.
if (!Subtarget->isLittleEndian())
return NextI;
bool IsUnscaled = TII->isUnscaledLdSt(*LoadI);
assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) &&
"Unsupported ld/st match");
assert(LoadSize <= StoreSize && "Invalid load size");
int UnscaledLdOffset = IsUnscaled
? getLdStOffsetOp(*LoadI).getImm()
: getLdStOffsetOp(*LoadI).getImm() * LoadSize;
int UnscaledStOffset = IsUnscaled
? getLdStOffsetOp(*StoreI).getImm()
: getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
int Imms = Immr + Width - 1;
unsigned DestReg = IsStoreXReg
? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
&AArch64::GPR64RegClass)
: LdRt;
assert((UnscaledLdOffset >= UnscaledStOffset &&
(UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
"Invalid offset");
Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
Imms = Immr + Width - 1;
if (UnscaledLdOffset == UnscaledStOffset) {
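// The load starts at the same address as the store, so we only need to
// mask down to the load width; e.g. (illustrative) promoting
// 'ldrh w2, [x0]' over 'str w1, [x0]' gives Immr = 0 and Imms = 15, and the
// encoded ANDWri mask is 0x0000ffff, i.e. 'and w2, w1, #0xffff'.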
uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
| ((Immr) << 6) // immr
| ((Imms) << 0) // imms
;
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
.add(StMO)
.addImm(AndMaskEncoded);
} else {
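// The load reads an interior slice of the stored value, so shift it down
// with an unsigned bitfield extract; e.g. (illustrative, and matching the
// example in optimizeBlock below) promoting 'ldrh w2, [x0, #6]' over
// 'str w1, [x0, #4]' gives Immr = 16 and Imms = 31, i.e.
// 'ubfm w2, w1, #16, #31', which is 'lsr w2, w1, #16'.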
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
DestReg)
.add(StMO)
.addImm(Immr)
.addImm(Imms);
}
}
// Clear kill flags between store and load.
for (MachineInstr &MI : make_range(StoreI->getIterator(),
BitExtMI->getIterator()))
if (MI.killsRegister(StRt, TRI)) {
MI.clearRegisterKills(StRt, TRI);
break;
}
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(LoadI->print(dbgs()));
DEBUG(dbgs() << " with instructions:\n ");
DEBUG(StoreI->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG((BitExtMI)->print(dbgs()));
DEBUG(dbgs() << "\n");
// Erase the old instructions.
LoadI->eraseFromParent();
return NextI;
}
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
BitVector &UsedRegs,
const TargetRegisterInfo *TRI) {
for (const MachineOperand &MO : MI.operands()) {
if (MO.isRegMask())
ModifiedRegs.setBitsNotInMask(MO.getRegMask());
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef()) {
// WZR/XZR are not modified even when used as a destination register.
if (Reg != AArch64::WZR && Reg != AArch64::XZR)
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
ModifiedRegs.set(*AI);
} else {
assert(MO.isUse() && "Reg operand not a def and not a use?!?");
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
UsedRegs.set(*AI);
}
}
}
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
// Convert the byte-offset used by unscaled load/store instructions into an
// "element" offset used by the scaled pair load/store instructions.
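// Paired instructions have a 7-bit signed scaled immediate, hence the
// [-64, 63] element range checked below.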
if (IsUnscaled) {
// If the byte-offset isn't a multiple of the stride, there's no point
// trying to match it.
if (Offset % OffsetStride)
return false;
Offset /= OffsetStride;
}
return Offset <= 63 && Offset >= -64;
}
// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
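// E.g. alignTo(5, 4) == 8, alignTo(8, 4) == 8 and alignTo(-3, 4) == 0.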
static int alignTo(int Num, int PowOf2) {
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA) {
// One of the instructions must modify memory.
if (!MIa.mayStore() && !MIb.mayStore())
return false;
// Both instructions must be memory operations.
if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
if (mayAlias(MIa, *MIb, AA))
return true;
return false;
}
bool AArch64LoadStoreOpt::findMatchingStore(
MachineBasicBlock::iterator I, unsigned Limit,
MachineBasicBlock::iterator &StoreI) {
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator MBBI = I;
MachineInstr &LoadMI = *I;
unsigned BaseReg = getLdStBaseOp(LoadMI).getReg();
// If the load is the first instruction in the block, there's obviously
// not any matching store.
if (MBBI == B)
return false;
// Track which registers have been modified and used between the first insn
// and the second insn.
ModifiedRegs.reset();
UsedRegs.reset();
unsigned Count = 0;
do {
--MBBI;
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
// may be different numbers of them if e.g. debug information is present.
if (!MI.isTransient())
++Count;
// If the load instruction reads directly from the address to which the
// store instruction writes and the stored value is not modified, we can
// promote the load. Since we do not handle stores with pre-/post-index,
// it's unnecessary to check if BaseReg is modified by the store itself.
if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
BaseReg == getLdStBaseOp(MI).getReg() &&
isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
!ModifiedRegs[getLdStRegOp(MI).getReg()]) {
StoreI = MBBI;
return true;
}
if (MI.isCall())
return false;
// Update modified / uses register lists.
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
if (ModifiedRegs[BaseReg])
return false;
// If we encounter a store aliased with the load, return early.
if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
return false;
} while (MBBI != B && Count < Limit);
return false;
}
// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
LdStPairFlags &Flags,
const AArch64InstrInfo *TII) {
// If this is volatile or if pairing is suppressed, not a candidate.
if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
return false;
// We should have already checked FirstMI for pair suppression and volatility.
assert(!FirstMI.hasOrderedMemoryRef() &&
!TII->isLdStPairSuppressed(FirstMI) &&
"FirstMI shouldn't get here if either of these checks are true.");
unsigned OpcA = FirstMI.getOpcode();
unsigned OpcB = MI.getOpcode();
// Opcodes match: nothing more to check.
if (OpcA == OpcB)
return true;
// Try to match a sign-extended load/store with a zero-extended load/store.
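// E.g. an LDRSWui can pair with an LDRWui: both map to the same non-sext
// opcode, and the sign extension is recreated after the merged LDP (see the
// SExtIdx handling in mergePairedInsns).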
bool IsValidLdStrOpc, PairIsValidLdStrOpc;
unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
assert(IsValidLdStrOpc &&
"Given Opc should be a Load or Store with an immediate");
// OpcA will be the first instruction in the pair.
if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
return true;
}
// If the second instruction isn't even a mergable/pairable load/store, bail
// out.
if (!PairIsValidLdStrOpc)
return false;
// FIXME: We don't support merging narrow stores with mixed scaled/unscaled
// offsets.
if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
return false;
// Try to match an unscaled load/store with a scaled load/store.
return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) &&
getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags, unsigned Limit,
bool FindNarrowMerge) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
MachineInstr &FirstMI = *I;
++MBBI;
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
unsigned Reg = getLdStRegOp(FirstMI).getReg();
unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
ModifiedRegs.reset();
UsedRegs.reset();
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
// may be different numbers of them if e.g. debug information is present.
if (!MI.isTransient())
++Count;
Flags.setSExtIdx(-1);
if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
getLdStOffsetOp(MI).isImm()) {
assert(MI.mayLoadOrStore() && "Expected memory operation.");
// If we've found another instruction with the same opcode, check to see
// if the base and offset are compatible with our starting instruction.
// These instructions all have scaled immediate operands, so we just
// check for +1/-1. Make sure to check the new instruction offset is
// actually an immediate and not a symbolic reference destined for
// a relocation.
unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
int MIOffset = getLdStOffsetOp(MI).getImm();
bool MIIsUnscaled = TII->isUnscaledLdSt(MI);
if (IsUnscaled != MIIsUnscaled) {
// We're trying to pair instructions that differ in how they are scaled.
// If FirstMI is scaled then scale the offset of MI accordingly.
// Otherwise, do the opposite (i.e., make MI's offset unscaled).
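// E.g. a scaled LDRXui counts its offset in 8-byte units while an unscaled
// LDURXi counts it in bytes, so an LDURXi offset of #16 corresponds to an
// LDRXui offset of #2.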
int MemSize = getMemScale(MI);
if (MIIsUnscaled) {
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together: bail and keep looking.
if (MIOffset % MemSize) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(&MI);
continue;
}
MIOffset /= MemSize;
} else {
MIOffset *= MemSize;
}
}
if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
(Offset + OffsetStride == MIOffset))) {
int MinOffset = Offset < MIOffset ? Offset : MIOffset;
if (FindNarrowMerge) {
// If the alignment requirements of the scaled wide load/store
// instruction can't express the offset of the scaled narrow input,
// bail and keep looking. For promotable zero stores, allow only when
// the stored value is the same (i.e., WZR).
if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
(IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(&MI);
continue;
}
} else {
// Pairwise instructions have a 7-bit signed offset field. Single
// insns have a 12-bit unsigned offset field. If the resultant
// immediate offset of merging these instructions is out of range for
// a pairwise instruction, bail and keep looking.
if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(&MI);
continue;
}
// If the alignment requirements of the paired (scaled) instruction
// can't express the offset of the unscaled input, bail and keep
// looking.
if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(&MI);
continue;
}
}
// If the destination register of the loads is the same register, bail
// and keep looking. A load-pair instruction with both destination
// registers the same is UNPREDICTABLE and will result in an exception.
if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
MemInsns.push_back(&MI);
continue;
}
// If the Rt of the second instruction was not modified or used between
// the two instructions and none of the instructions between the second
// and first alias with the second, we can combine the second into the
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
!mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
}
// Likewise, if the Rt of the first instruction is not modified or used
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
!mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
}
// Unable to combine these instructions due to interference in between.
// Keep looking.
}
}
// The instruction wasn't a matching load or store. Stop searching if we
// encounter a call instruction that might modify memory.
if (MI.isCall())
return E;
// Update modified / uses register lists.
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
// Otherwise, if the base register is modified, we have no match, so
// return early.
if (ModifiedRegs[BaseReg])
return E;
// Update list of instructions that read/write memory.
if (MI.mayLoadOrStore())
MemInsns.push_back(&MI);
}
return E;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update,
bool IsPreIdx) {
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
MachineBasicBlock::iterator NextI = I;
// Return the instruction following the merged instruction, which is
// the instruction following our unmerged load. Unless that's the add/sub
// instruction we're merging, in which case it's the one after that.
if (++NextI == Update)
++NextI;
int Value = Update->getOperand(2).getImm();
assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
"Can't merge 1 << 12 offset into pre-/post-indexed load / store");
if (Update->getOpcode() == AArch64::SUBXri)
Value = -Value;
unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
: getPostIndexedOpcode(I->getOpcode());
MachineInstrBuilder MIB;
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I))
.add(getLdStBaseOp(*I))
.addImm(Value)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
int Scale = getMemScale(*I);
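// Pre-/post-indexed paired instructions take their writeback immediate in
// units of the access size, so scale the update value down; e.g. an
// 'add x2, x2, #16' folded into an X-register LDP becomes an immediate of #2.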
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
.add(getLdStRegOp(*Update))
.add(getLdStRegOp(*I, 0))
.add(getLdStRegOp(*I, 1))
.add(getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
(void)MIB;
if (IsPreIdx)
DEBUG(dbgs() << "Creating pre-indexed load/store.");
else
DEBUG(dbgs() << "Creating post-indexed load/store.");
DEBUG(dbgs() << " Replacing instructions:\n ");
DEBUG(I->print(dbgs()));
DEBUG(dbgs() << " ");
DEBUG(Update->print(dbgs()));
DEBUG(dbgs() << " with instruction:\n ");
DEBUG(((MachineInstr *)MIB)->print(dbgs()));
DEBUG(dbgs() << "\n");
// Erase the old instructions for the block.
I->eraseFromParent();
Update->eraseFromParent();
return NextI;
}
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
switch (MI.getOpcode()) {
default:
break;
case AArch64::SUBXri:
case AArch64::ADDXri:
// Make sure it's a vanilla immediate operand, not a relocation or
// anything else we can't handle.
if (!MI.getOperand(2).isImm())
break;
// Watch out for 1 << 12 shifted value.
if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
break;
// The update instruction source and destination register must be the
// same as the load/store base register.
if (MI.getOperand(0).getReg() != BaseReg ||
MI.getOperand(1).getReg() != BaseReg)
break;
bool IsPairedInsn = isPairedLdSt(MemMI);
int UpdateOffset = MI.getOperand(2).getImm();
if (MI.getOpcode() == AArch64::SUBXri)
UpdateOffset = -UpdateOffset;
// For non-paired load/store instructions, the immediate must fit in a
// signed 9-bit integer.
if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
break;
// For paired load/store instructions, the immediate must be a multiple of
// the scaling factor. The scaled offset must also fit into a signed 7-bit
// integer.
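// E.g. for an X-register pair (scale 8), an update of #504 scales to 63 and
// is accepted, while #512 scales to 64 and is rejected.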
if (IsPairedInsn) {
int Scale = getMemScale(MemMI);
if (UpdateOffset % Scale != 0)
break;
int ScaledOffset = UpdateOffset / Scale;
if (ScaledOffset > 63 || ScaledOffset < -64)
break;
}
// If we have a non-zero Offset, we check that it matches the amount
// we're adding to the register.
if (!Offset || Offset == UpdateOffset)
return true;
break;
}
return false;
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
// Scan forward looking for post-index opportunities. Updating instructions
// can't be formed if the memory instruction doesn't have the offset we're
// looking for.
if (MIUnscaledOffset != UnscaledOffset)
return E;
// If the base register overlaps a destination register, we can't
// merge the update.
bool IsPairedInsn = isPairedLdSt(MemMI);
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
return E;
}
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
ModifiedRegs.reset();
UsedRegs.reset();
++MBBI;
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
// may be different numbers of them if e.g. debug information is present.
if (!MI.isTransient())
++Count;
// If we found a match, return it.
if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
return MBBI;
// Update the status of what the instruction clobbered and used.
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
return E;
}
return E;
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator I, unsigned Limit) {
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr &MemMI = *I;
MachineBasicBlock::iterator MBBI = I;
unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
int Offset = getLdStOffsetOp(MemMI).getImm();
// If the load/store is the first instruction in the block, there's obviously
// not any matching update. Ditto if the memory offset isn't zero.
if (MBBI == B || Offset != 0)
return E;
// If the base register overlaps a destination register, we can't
// merge the update.
bool IsPairedInsn = isPairedLdSt(MemMI);
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
return E;
}
// Track which registers have been modified and used between the first insn
// (inclusive) and the second insn.
ModifiedRegs.reset();
UsedRegs.reset();
unsigned Count = 0;
do {
--MBBI;
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
// may be different numbers of them if e.g. debug information is present.
if (!MI.isTransient())
++Count;
// If we found a match, return it.
if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
return MBBI;
// Update the status of what the instruction clobbered and used.
trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
return E;
} while (MBBI != B && Count < Limit);
return E;
}
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
// If this is a volatile load, don't mess with it.
if (MI.hasOrderedMemoryRef())
return false;
// Make sure this is a reg+imm.
// FIXME: It is possible to extend it to handle reg+reg cases.
if (!getLdStOffsetOp(MI).isImm())
return false;
// Look backward up to LdStLimit instructions.
MachineBasicBlock::iterator StoreI;
if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
++NumLoadsFromStoresPromoted;
// Promote the load. Keeping the iterator straight is a
// pain, so we let the merge routine tell us what the next instruction
// is after it's done mucking about.
MBBI = promoteLoadFromStore(MBBI, StoreI);
return true;
}
return false;
}
// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
MachineBasicBlock::iterator &MBBI) {
assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
MachineInstr &MI = *MBBI;
MachineBasicBlock::iterator E = MI.getParent()->end();
if (!TII->isCandidateToMergeOrPair(MI))
return false;
// Look ahead up to LdStLimit instructions for a mergable instruction.
LdStPairFlags Flags;
MachineBasicBlock::iterator MergeMI =
findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
if (MergeMI != E) {
++NumZeroStoresPromoted;
// Keeping the iterator straight is a pain, so we let the merge routine tell
// us what the next instruction is after it's done mucking about.
MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
return true;
}
return false;
}
// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock::iterator E = MI.getParent()->end();
if (!TII->isCandidateToMergeOrPair(MI))
return false;
// Early exit if the offset is not possible to match. (6 bits of positive
// range, plus allow an extra one in case we find a later insn that matches
// with Offset-1)
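// E.g. a scaled offset of #64 is itself out of pairing range but can still
// pair with a neighbouring access at #63, hence the OffsetStride adjustment.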
bool IsUnscaled = TII->isUnscaledLdSt(MI);
int Offset = getLdStOffsetOp(MI).getImm();
int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
// Allow one more for offset.
if (Offset > 0)
Offset -= OffsetStride;
if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
return false;
// Look ahead up to LdStLimit instructions for a pairable instruction.
LdStPairFlags Flags;
MachineBasicBlock::iterator Paired =
findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
if (Paired != E) {
++NumPairCreated;
if (TII->isUnscaledLdSt(MI))
++NumUnscaledPairCreated;
// Keeping the iterator straight is a pain, so we let the merge routine tell
// us what the next instruction is after it's done mucking about.
MBBI = mergePairedInsns(MBBI, Paired, Flags);
return true;
}
return false;
}
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
bool Modified = false;
// Four transformations to do here:
// 1) Find loads that directly read from stores and promote them by
// replacing with mov instructions. If the store is wider than the load,
// the load will be replaced with a bitfield extract.
// e.g.,
// str w1, [x0, #4]
// ldrh w2, [x0, #6]
// ; becomes
// str w1, [x0, #4]
// lsr w2, w1, #16
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
MachineInstr &MI = *MBBI;
switch (MI.getOpcode()) {
default:
// Just move on to the next instruction.
++MBBI;
break;
// Scaled instructions.
case AArch64::LDRBBui:
case AArch64::LDRHHui:
case AArch64::LDRWui:
case AArch64::LDRXui:
// Unscaled instructions.
case AArch64::LDURBBi:
case AArch64::LDURHHi:
case AArch64::LDURWi:
case AArch64::LDURXi:
if (tryToPromoteLoadFromStore(MBBI)) {
Modified = true;
break;
}
++MBBI;
break;
}
}
// 2) Merge adjacent zero stores into a wider store.
// e.g.,
// strh wzr, [x0]
// strh wzr, [x0, #2]
// ; becomes
// str wzr, [x0]
// e.g.,
// str wzr, [x0]
// str wzr, [x0, #4]
// ; becomes
// str xzr, [x0]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
EnableNarrowZeroStOpt && MBBI != E;) {
if (isPromotableZeroStoreInst(*MBBI)) {
if (tryToMergeZeroStInst(MBBI)) {
Modified = true;
} else
++MBBI;
} else
++MBBI;
}
// 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
Modified = true;
else
++MBBI;
}
// 4) Find base register updates that can be merged into the load or store
// as a base-reg writeback.
// e.g.,
// ldr x0, [x2]
// add x2, x2, #4
// ; becomes
// ldr x0, [x2], #4
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
MachineInstr &MI = *MBBI;
// Do update merging. It's simpler to keep this separate from the above
// switches, though not strictly necessary.
unsigned Opc = MI.getOpcode();
switch (Opc) {
default:
// Just move on to the next instruction.
++MBBI;
break;
// Scaled instructions.
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
case AArch64::STRXui:
case AArch64::STRWui:
case AArch64::STRHHui:
case AArch64::STRBBui:
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
case AArch64::LDRXui:
case AArch64::LDRWui:
case AArch64::LDRHHui:
case AArch64::LDRBBui:
// Unscaled instructions.
case AArch64::STURSi:
case AArch64::STURDi:
case AArch64::STURQi:
case AArch64::STURWi:
case AArch64::STURXi:
case AArch64::LDURSi:
case AArch64::LDURDi:
case AArch64::LDURQi:
case AArch64::LDURWi:
case AArch64::LDURXi:
// Paired instructions.
case AArch64::LDPSi:
case AArch64::LDPSWi:
case AArch64::LDPDi:
case AArch64::LDPQi:
case AArch64::LDPWi:
case AArch64::LDPXi:
case AArch64::STPSi:
case AArch64::STPDi:
case AArch64::STPQi:
case AArch64::STPWi:
case AArch64::STPXi: {
// Make sure this is a reg+imm (as opposed to an address reloc).
if (!getLdStOffsetOp(MI).isImm()) {
++MBBI;
break;
}
// Look forward to try to form a post-index instruction. For example,
// ldr x0, [x20]
// add x20, x20, #32
// merged into:
// ldr x0, [x20], #32
MachineBasicBlock::iterator Update =
findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
Modified = true;
++NumPostFolded;
break;
}
- // Don't know how to handle pre/post-index versions, so move to the next
- // instruction.
+
+ // Don't know how to handle unscaled pre/post-index versions below, so
+ // move to the next instruction.
if (TII->isUnscaledLdSt(Opc)) {
++MBBI;
break;
}
// Look back to try to find a pre-index instruction. For example,
// add x0, x0, #8
// ldr x1, [x0]
// merged into:
// ldr x1, [x0, #8]!
Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
}
// The immediate in the load/store is scaled by the size of the memory
// operation. The immediate in the add we're looking for,
// however, is not, so adjust here.
int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
// Look forward for an update instruction that can be merged to form a
// pre-index instruction. For example,
// ldr x1, [x0, #64]
// add x0, x0, #64
// merged into:
// ldr x1, [x0, #64]!
Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
Modified = true;
++NumPreFolded;
break;
}
// Nothing found. Just move to the next instruction.
++MBBI;
break;
}
}
}
return Modified;
}
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))
return false;
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
// Resize the modified and used register bitfield trackers. We do this once
// per function and then clear the bitfield each time we optimize a load or
// store.
ModifiedRegs.resize(TRI->getNumRegs());
UsedRegs.resize(TRI->getNumRegs());
bool Modified = false;
bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
return Modified;
}
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, pre-RA pairing pass may not be worth the effort.
// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.
/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
return new AArch64LoadStoreOpt();
}
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index ec49f0d37af4..46d8f0dba691 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1,1726 +1,1710 @@
//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling, if-conversion, and other late
// optimizations. This pass should be run after register allocation but before
// the post-regalloc scheduling pass.
//
//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "arm-pseudo"
static cl::opt<bool>
VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
cl::desc("Verify machine code after expanding ARM pseudos"));
namespace {
class ARMExpandPseudo : public MachineFunctionPass {
public:
static char ID;
ARMExpandPseudo() : MachineFunctionPass(ID) {}
const ARMBaseInstrInfo *TII;
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
ARMFunctionInfo *AFI;
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override {
return "ARM pseudo instruction expansion pass";
}
private:
void TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
bool ExpandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
void ExpandVLD(MachineBasicBlock::iterator &MBBI);
void ExpandVST(MachineBasicBlock::iterator &MBBI);
void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned Opc, bool IsExt);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
unsigned StrexOp, unsigned UxtOp,
MachineBasicBlock::iterator &NextMBBI);
bool ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
};
char ARMExpandPseudo::ID = 0;
}
/// TransferImpOps - Transfer implicit operands on the pseudo instruction to
/// the instructions created from the expansion.
void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI,
MachineInstrBuilder &DefMI) {
const MCInstrDesc &Desc = OldMI.getDesc();
for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands();
i != e; ++i) {
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
UseMI.add(MO);
else
DefMI.add(MO);
}
}
namespace {
// Constants for register spacing in NEON load/store instructions.
// For quad-register load-lane and store-lane pseudo instructions, the
// spacing is initially assumed to be EvenDblSpc, and that is changed to
// OddDblSpc depending on the lane number operand.
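// E.g. for a QQ register made up of D0-D3, EvenDblSpc selects D0/D2 and
// OddDblSpc selects D1/D3.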
enum NEONRegSpacing {
SingleSpc,
EvenDblSpc,
OddDblSpc
};
// Entries for NEON load/store information table. The table is sorted by
// PseudoOpc for fast binary-search lookups.
struct NEONLdStTableEntry {
uint16_t PseudoOpc;
uint16_t RealOpc;
bool IsLoad;
bool isUpdating;
bool hasWritebackOperand;
uint8_t RegSpacing; // One of type NEONRegSpacing
uint8_t NumRegs; // D registers loaded or stored
uint8_t RegElts; // elements per D register; used for lane ops
// FIXME: Temporary flag to denote whether the real instruction takes
// a single register (like the encoding) or all of the registers in
// the list (like the asm syntax and the isel DAG). When all definitions
// are converted to take only the single encoded register, this will
// go away.
bool copyAllListRegs;
// Comparison methods for binary search of the table.
bool operator<(const NEONLdStTableEntry &TE) const {
return PseudoOpc < TE.PseudoOpc;
}
friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
return TE.PseudoOpc < PseudoOpc;
}
friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
const NEONLdStTableEntry &TE) {
return PseudoOpc < TE.PseudoOpc;
}
};
}
static const NEONLdStTableEntry NEONLdStTable[] = {
{ ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true},
{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true},
{ ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, false, EvenDblSpc, 1, 2 ,true},
{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true, EvenDblSpc, 1, 2 ,true},
{ ARM::VLD1LNq8Pseudo, ARM::VLD1LNd8, true, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1LNq8Pseudo_UPD, ARM::VLD1LNd8_UPD, true, true, true, EvenDblSpc, 1, 8 ,true},
{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64QPseudoWB_fixed, ARM::VLD1d64Qwb_fixed, true, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD1d64TPseudoWB_fixed, ARM::VLD1d64Twb_fixed, true, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true},
{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true, SingleSpc, 2, 2 ,true},
{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, false, SingleSpc, 2, 8 ,true},
{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, true, SingleSpc, 2, 8 ,true},
{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, false, EvenDblSpc, 2, 4 ,true},
{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true, EvenDblSpc, 2, 4 ,true},
{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, false, EvenDblSpc, 2, 2 ,true},
{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true, EvenDblSpc, 2, 2 ,true},
{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, false, SingleSpc, 4, 4 ,false},
{ ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q16wb_fixed, true, true, false, SingleSpc, 4, 4 ,false},
{ ARM::VLD2q16PseudoWB_register, ARM::VLD2q16wb_register, true, true, true, SingleSpc, 4, 4 ,false},
{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, false, SingleSpc, 4, 2 ,false},
{ ARM::VLD2q32PseudoWB_fixed, ARM::VLD2q32wb_fixed, true, true, false, SingleSpc, 4, 2 ,false},
{ ARM::VLD2q32PseudoWB_register, ARM::VLD2q32wb_register, true, true, true, SingleSpc, 4, 2 ,false},
{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, false, SingleSpc, 4, 8 ,false},
{ ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q8wb_fixed, true, true, false, SingleSpc, 4, 8 ,false},
{ ARM::VLD2q8PseudoWB_register, ARM::VLD2q8wb_register, true, true, true, SingleSpc, 4, 8 ,false},
{ ARM::VLD3DUPd16Pseudo, ARM::VLD3DUPd16, true, false, false, SingleSpc, 3, 4,true},
{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true, SingleSpc, 3, 4,true},
{ ARM::VLD3DUPd32Pseudo, ARM::VLD3DUPd32, true, false, false, SingleSpc, 3, 2,true},
{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true},
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, false, SingleSpc, 3, 2 ,true},
{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true, SingleSpc, 3, 2 ,true},
{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, false, SingleSpc, 3, 8 ,true},
{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, true, SingleSpc, 3, 8 ,true},
{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, false, EvenDblSpc, 3, 4 ,true},
{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true},
{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, false, EvenDblSpc, 3, 2 ,true},
{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true},
{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, false, SingleSpc, 3, 2 ,true},
{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, true, SingleSpc, 3, 2 ,true},
{ ARM::VLD3d8Pseudo, ARM::VLD3d8, true, false, false, SingleSpc, 3, 8 ,true},
{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, true, SingleSpc, 3, 8 ,true},
{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, true, EvenDblSpc, 3, 4 ,true},
{ ARM::VLD3q16oddPseudo, ARM::VLD3q16, true, false, false, OddDblSpc, 3, 4 ,true},
{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true, OddDblSpc, 3, 4 ,true},
{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, true, EvenDblSpc, 3, 2 ,true},
{ ARM::VLD3q32oddPseudo, ARM::VLD3q32, true, false, false, OddDblSpc, 3, 2 ,true},
{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true, OddDblSpc, 3, 2 ,true},
{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, true, EvenDblSpc, 3, 8 ,true},
{ ARM::VLD3q8oddPseudo, ARM::VLD3q8, true, false, false, OddDblSpc, 3, 8 ,true},
{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, true, OddDblSpc, 3, 8 ,true},
{ ARM::VLD4DUPd16Pseudo, ARM::VLD4DUPd16, true, false, false, SingleSpc, 4, 4,true},
{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true, SingleSpc, 4, 4,true},
{ ARM::VLD4DUPd32Pseudo, ARM::VLD4DUPd32, true, false, false, SingleSpc, 4, 2,true},
{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true},
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, false, SingleSpc, 4, 2 ,true},
{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true, SingleSpc, 4, 2 ,true},
{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, false, SingleSpc, 4, 8 ,true},
{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, true, SingleSpc, 4, 8 ,true},
{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, false, EvenDblSpc, 4, 4 ,true},
{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true},
{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, false, EvenDblSpc, 4, 2 ,true},
{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true},
{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, false, SingleSpc, 4, 2 ,true},
{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, true, SingleSpc, 4, 2 ,true},
{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, false, SingleSpc, 4, 8 ,true},
{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, true, SingleSpc, 4, 8 ,true},
{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, true, EvenDblSpc, 4, 4 ,true},
{ ARM::VLD4q16oddPseudo, ARM::VLD4q16, true, false, false, OddDblSpc, 4, 4 ,true},
{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true, OddDblSpc, 4, 4 ,true},
{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, true, EvenDblSpc, 4, 2 ,true},
{ ARM::VLD4q32oddPseudo, ARM::VLD4q32, true, false, false, OddDblSpc, 4, 2 ,true},
{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true, OddDblSpc, 4, 2 ,true},
{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, true, EvenDblSpc, 4, 8 ,true},
{ ARM::VLD4q8oddPseudo, ARM::VLD4q8, true, false, false, OddDblSpc, 4, 8 ,true},
{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, true, OddDblSpc, 4, 8 ,true},
{ ARM::VST1LNq16Pseudo, ARM::VST1LNd16, false, false, false, EvenDblSpc, 1, 4 ,true},
{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true, EvenDblSpc, 1, 4 ,true},
{ ARM::VST1LNq32Pseudo, ARM::VST1LNd32, false, false, false, EvenDblSpc, 1, 2 ,true},
{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true, EvenDblSpc, 1, 2 ,true},
{ ARM::VST1LNq8Pseudo, ARM::VST1LNd8, false, false, false, EvenDblSpc, 1, 8 ,true},
{ ARM::VST1LNq8Pseudo_UPD, ARM::VST1LNd8_UPD, false, true, true, EvenDblSpc, 1, 8 ,true},
{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_fixed, ARM::VST1d64Qwb_fixed, false, true, false, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true, SingleSpc, 4, 1 ,false},
{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_fixed, ARM::VST1d64Twb_fixed, false, true, false, SingleSpc, 3, 1 ,false},
{ ARM::VST1d64TPseudoWB_register, ARM::VST1d64Twb_register, false, true, true, SingleSpc, 3, 1 ,false},
{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, false, SingleSpc, 2, 4 ,true},
{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true, SingleSpc, 2, 4 ,true},
{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, false, SingleSpc, 2, 2 ,true},
{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true, SingleSpc, 2, 2 ,true},
{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, false, SingleSpc, 2, 8 ,true},
{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, true, SingleSpc, 2, 8 ,true},
{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, false, EvenDblSpc, 2, 4,true},
{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true, EvenDblSpc, 2, 4,true},
{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, false, EvenDblSpc, 2, 2,true},
{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true, EvenDblSpc, 2, 2,true},
{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, false, SingleSpc, 4, 4 ,false},
{ ARM::VST2q16PseudoWB_fixed, ARM::VST2q16wb_fixed, false, true, false, SingleSpc, 4, 4 ,false},
{ ARM::VST2q16PseudoWB_register, ARM::VST2q16wb_register, false, true, true, SingleSpc, 4, 4 ,false},
{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, false, SingleSpc, 4, 2 ,false},
{ ARM::VST2q32PseudoWB_fixed, ARM::VST2q32wb_fixed, false, true, false, SingleSpc, 4, 2 ,false},
{ ARM::VST2q32PseudoWB_register, ARM::VST2q32wb_register, false, true, true, SingleSpc, 4, 2 ,false},
{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, false, SingleSpc, 4, 8 ,false},
{ ARM::VST2q8PseudoWB_fixed, ARM::VST2q8wb_fixed, false, true, false, SingleSpc, 4, 8 ,false},
{ ARM::VST2q8PseudoWB_register, ARM::VST2q8wb_register, false, true, true, SingleSpc, 4, 8 ,false},
{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true, SingleSpc, 3, 4 ,true},
{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, false, SingleSpc, 3, 2 ,true},
{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true, SingleSpc, 3, 2 ,true},
{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, false, SingleSpc, 3, 8 ,true},
{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, true, SingleSpc, 3, 8 ,true},
{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, false, EvenDblSpc, 3, 4,true},
{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true, EvenDblSpc, 3, 4,true},
{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, false, EvenDblSpc, 3, 2,true},
{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true, EvenDblSpc, 3, 2,true},
{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, false, SingleSpc, 3, 4 ,true},
{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, true, SingleSpc, 3, 4 ,true},
{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, false, SingleSpc, 3, 2 ,true},
{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, true, SingleSpc, 3, 2 ,true},
{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, false, SingleSpc, 3, 8 ,true},
{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, true, SingleSpc, 3, 8 ,true},
{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, true, EvenDblSpc, 3, 4 ,true},
{ ARM::VST3q16oddPseudo, ARM::VST3q16, false, false, false, OddDblSpc, 3, 4 ,true},
{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true, OddDblSpc, 3, 4 ,true},
{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, true, EvenDblSpc, 3, 2 ,true},
{ ARM::VST3q32oddPseudo, ARM::VST3q32, false, false, false, OddDblSpc, 3, 2 ,true},
{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true, OddDblSpc, 3, 2 ,true},
{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, true, EvenDblSpc, 3, 8 ,true},
{ ARM::VST3q8oddPseudo, ARM::VST3q8, false, false, false, OddDblSpc, 3, 8 ,true},
{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, true, OddDblSpc, 3, 8 ,true},
{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true, SingleSpc, 4, 4 ,true},
{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, false, SingleSpc, 4, 2 ,true},
{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true, SingleSpc, 4, 2 ,true},
{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, false, SingleSpc, 4, 8 ,true},
{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, true, SingleSpc, 4, 8 ,true},
{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, false, EvenDblSpc, 4, 4,true},
{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true, EvenDblSpc, 4, 4,true},
{ ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, false, EvenDblSpc, 4, 2,true},
{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true, EvenDblSpc, 4, 2,true},
{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, false, SingleSpc, 4, 4 ,true},
{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, true, SingleSpc, 4, 4 ,true},
{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, false, SingleSpc, 4, 2 ,true},
{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, true, SingleSpc, 4, 2 ,true},
{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, false, SingleSpc, 4, 8 ,true},
{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, true, SingleSpc, 4, 8 ,true},
{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, true, EvenDblSpc, 4, 4 ,true},
{ ARM::VST4q16oddPseudo, ARM::VST4q16, false, false, false, OddDblSpc, 4, 4 ,true},
{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true, OddDblSpc, 4, 4 ,true},
{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, true, EvenDblSpc, 4, 2 ,true},
{ ARM::VST4q32oddPseudo, ARM::VST4q32, false, false, false, OddDblSpc, 4, 2 ,true},
{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true, OddDblSpc, 4, 2 ,true},
{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, true, EvenDblSpc, 4, 8 ,true},
{ ARM::VST4q8oddPseudo, ARM::VST4q8, false, false, false, OddDblSpc, 4, 8 ,true},
{ ARM::VST4q8oddPseudo_UPD, ARM::VST4q8_UPD, false, true, true, OddDblSpc, 4, 8 ,true}
};
/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
/// load or store pseudo instruction.
static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
#ifndef NDEBUG
// Make sure the table is sorted.
static bool TableChecked = false;
if (!TableChecked) {
assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
"NEONLdStTable is not sorted!");
TableChecked = true;
}
#endif
auto I = std::lower_bound(std::begin(NEONLdStTable),
std::end(NEONLdStTable), Opcode);
if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode)
return I;
return nullptr;
}
/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
/// corresponding to the specified register spacing. Not all of the results
/// are necessarily valid, e.g., a Q register only has 2 D subregisters.
static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
const TargetRegisterInfo *TRI, unsigned &D0,
unsigned &D1, unsigned &D2, unsigned &D3) {
if (RegSpc == SingleSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_1);
D2 = TRI->getSubReg(Reg, ARM::dsub_2);
D3 = TRI->getSubReg(Reg, ARM::dsub_3);
} else if (RegSpc == EvenDblSpc) {
D0 = TRI->getSubReg(Reg, ARM::dsub_0);
D1 = TRI->getSubReg(Reg, ARM::dsub_2);
D2 = TRI->getSubReg(Reg, ARM::dsub_4);
D3 = TRI->getSubReg(Reg, ARM::dsub_6);
} else {
assert(RegSpc == OddDblSpc && "unknown register spacing");
D0 = TRI->getSubReg(Reg, ARM::dsub_1);
D1 = TRI->getSubReg(Reg, ARM::dsub_3);
D2 = TRI->getSubReg(Reg, ARM::dsub_5);
D3 = TRI->getSubReg(Reg, ARM::dsub_7);
}
}
/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
/// operands to real VLD instructions with D register operands.
void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(TableEntry->RealOpc));
unsigned OpIdx = 0;
bool DstIsDead = MI.getOperand(OpIdx).isDead();
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 1 && TableEntry->copyAllListRegs)
MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 2 && TableEntry->copyAllListRegs)
MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 3 && TableEntry->copyAllListRegs)
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
if (TableEntry->isUpdating)
MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
MIB.add(MI.getOperand(OpIdx++));
// For an instruction writing double-spaced subregs, the pseudo instruction
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
SrcOpIdx = OpIdx++;
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source operand used for double-spaced subregs over
// to the new instruction as an implicit operand.
if (SrcOpIdx != 0) {
MachineOperand MO = MI.getOperand(SrcOpIdx);
MO.setImplicit(true);
MIB.add(MO);
}
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
}
/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
/// operands to real VST instructions with D register operands.
void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(TableEntry->RealOpc));
unsigned OpIdx = 0;
if (TableEntry->isUpdating)
MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0, getUndefRegState(SrcIsUndef));
if (NumRegs > 1 && TableEntry->copyAllListRegs)
MIB.addReg(D1, getUndefRegState(SrcIsUndef));
if (NumRegs > 2 && TableEntry->copyAllListRegs)
MIB.addReg(D2, getUndefRegState(SrcIsUndef));
if (NumRegs > 3 && TableEntry->copyAllListRegs)
MIB.addReg(D3, getUndefRegState(SrcIsUndef));
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
MIB->addRegisterKilled(SrcReg, TRI, true);
else if (!SrcIsUndef)
MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg.
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
}
/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
/// register operands to real instructions with D register operands.
void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && "NEONLdStTable lookup failed");
NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
unsigned RegElts = TableEntry->RegElts;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(TableEntry->RealOpc));
unsigned OpIdx = 0;
// The lane operand is always the 3rd from last operand, before the 2
// predicate operands.
unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
// Adjust the lane and spacing as needed for Q registers.
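// E.g. (illustrative) with 16-bit elements there are 4 lanes per D register,
// so lane 5 of a Q operand becomes lane 1 of the odd D subregisters
// (OddDblSpc).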
assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
if (RegSpc == EvenDblSpc && Lane >= RegElts) {
RegSpc = OddDblSpc;
Lane -= RegElts;
}
assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
unsigned DstReg = 0;
bool DstIsDead = false;
if (TableEntry->IsLoad) {
DstIsDead = MI.getOperand(OpIdx).isDead();
DstReg = MI.getOperand(OpIdx++).getReg();
GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 1)
MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 2)
MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
if (NumRegs > 3)
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
}
if (TableEntry->isUpdating)
MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
MIB.add(MI.getOperand(OpIdx++));
// Grab the super-register source.
MachineOperand MO = MI.getOperand(OpIdx++);
if (!TableEntry->IsLoad)
GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
// Add the subregs as sources of the new instruction.
unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
getKillRegState(MO.isKill()));
MIB.addReg(D0, SrcFlags);
if (NumRegs > 1)
MIB.addReg(D1, SrcFlags);
if (NumRegs > 2)
MIB.addReg(D2, SrcFlags);
if (NumRegs > 3)
MIB.addReg(D3, SrcFlags);
// Add the lane number operand.
MIB.addImm(Lane);
OpIdx += 1;
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source to be an implicit source.
MO.setImplicit(true);
MIB.add(MO);
if (TableEntry->IsLoad)
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
TransferImpOps(MI, MIB, MIB);
// Transfer memoperands.
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
}
/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
/// register operands to real instructions with D register operands.
void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned Opc, bool IsExt) {
MachineInstr &MI = *MBBI;
MachineBasicBlock &MBB = *MI.getParent();
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
unsigned OpIdx = 0;
// Transfer the destination register operand.
MIB.add(MI.getOperand(OpIdx++));
if (IsExt)
MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
unsigned D0, D1, D2, D3;
GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
MIB.addReg(D0);
// Copy the other source register operand.
MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Add an implicit kill and use for the super-reg.
MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
}
static bool IsAnAddressOperand(const MachineOperand &MO) {
// This check is overly conservative. Unless we are certain that the machine
// operand is not a symbol reference, we return that it is a symbol reference.
// This is important as the load pair may not be split up when targeting Windows.
switch (MO.getType()) {
case MachineOperand::MO_Register:
case MachineOperand::MO_Immediate:
case MachineOperand::MO_CImmediate:
case MachineOperand::MO_FPImmediate:
return false;
case MachineOperand::MO_MachineBasicBlock:
return true;
case MachineOperand::MO_FrameIndex:
return false;
case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_TargetIndex:
case MachineOperand::MO_JumpTableIndex:
case MachineOperand::MO_ExternalSymbol:
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_BlockAddress:
return true;
case MachineOperand::MO_RegisterMask:
case MachineOperand::MO_RegisterLiveOut:
return false;
case MachineOperand::MO_Metadata:
case MachineOperand::MO_MCSymbol:
return true;
case MachineOperand::MO_CFIIndex:
return false;
case MachineOperand::MO_IntrinsicID:
case MachineOperand::MO_Predicate:
llvm_unreachable("should not exist post-isel");
}
llvm_unreachable("unhandled machine operand type");
}
void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
unsigned PredReg = 0;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
MachineInstrBuilder LO16, HI16;
if (!STI->hasV6T2Ops() &&
(Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
// FIXME Windows CE supports older ARM CPUs
assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
// Expand into a movi + orr.
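// E.g. (illustrative) an immediate such as 0x0000ABCD may be split by
// getSOImmTwoPartFirst/Second into 0xCD and 0xAB00, giving:
//   mov dst, #0xCD
//   orr dst, dst, #0xAB00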
LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg);
assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
unsigned ImmVal = (unsigned)MO.getImm();
unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
LO16 = LO16.addImm(SOImmValV1);
HI16 = HI16.addImm(SOImmValV2);
LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
return;
}
unsigned LO16Opc = 0;
unsigned HI16Opc = 0;
if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
LO16Opc = ARM::t2MOVi16;
HI16Opc = ARM::t2MOVTi16;
} else {
LO16Opc = ARM::MOVi16;
HI16Opc = ARM::MOVTi16;
}
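// E.g. (illustrative) 0x12345678 becomes "movw dst, #0x5678" followed by
// "movt dst, #0x1234" (or the Thumb2 equivalents).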
LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg);
switch (MO.getType()) {
case MachineOperand::MO_Immediate: {
unsigned Imm = MO.getImm();
unsigned Lo16 = Imm & 0xffff;
unsigned Hi16 = (Imm >> 16) & 0xffff;
LO16 = LO16.addImm(Lo16);
HI16 = HI16.addImm(Hi16);
break;
}
case MachineOperand::MO_ExternalSymbol: {
const char *ES = MO.getSymbolName();
unsigned TF = MO.getTargetFlags();
LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16);
HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16);
break;
}
default: {
const GlobalValue *GV = MO.getGlobal();
unsigned TF = MO.getTargetFlags();
LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
break;
}
}
LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
LO16.addImm(Pred).addReg(PredReg);
HI16.addImm(Pred).addReg(PredReg);
if (RequiresBundling)
finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator());
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
}
/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
/// possible. This only gets used at -O0 so we don't care about efficiency of
/// the generated code.
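/// The emitted control flow is roughly (illustrative; for 8/16-bit swaps the
/// desired value is first zero-extended with uxtb/uxth):
///   .Lloadcmp: ldrex   rDest, [rAddr]
///              cmp     rDest, rDesired
///              bne     .Ldone
///   .Lstore:   strex   rTempReg, rNew, [rAddr]
///              cmp     rTempReg, #0
///              bne     .Lloadcmp
///   .Ldone: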
bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned LdrexOp, unsigned StrexOp,
unsigned UxtOp,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
- bool StatusDead = MI.getOperand(1).isDead();
+ unsigned TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
unsigned AddrReg = MI.getOperand(2).getReg();
unsigned DesiredReg = MI.getOperand(3).getReg();
unsigned NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadCmpBB);
MF->insert(++LoadCmpBB->getIterator(), StoreBB);
MF->insert(++StoreBB->getIterator(), DoneBB);
if (UxtOp) {
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg)
.addReg(DesiredReg, RegState::Kill);
if (!IsThumb)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
}
// .Lloadcmp:
- // mov wStatus, #0
// ldrex rDest, [rAddr]
// cmp rDest, rDesired
// bne .Ldone
- if (!StatusDead) {
- if (IsThumb) {
- BuildMI(LoadCmpBB, DL, TII->get(ARM::tMOVi8), StatusReg)
- .addDef(ARM::CPSR, RegState::Dead)
- .addImm(0)
- .add(predOps(ARMCC::AL));
- } else {
- BuildMI(LoadCmpBB, DL, TII->get(ARM::MOVi), StatusReg)
- .addImm(0)
- .add(predOps(ARMCC::AL))
- .add(condCodeOp());
- }
- }
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
MIB.addReg(AddrReg);
if (LdrexOp == ARM::t2LDREX)
MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
MIB.add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
.addReg(DesiredReg)
.add(predOps(ARMCC::AL));
unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
BuildMI(LoadCmpBB, DL, TII->get(Bcc))
.addMBB(DoneBB)
.addImm(ARMCC::NE)
.addReg(ARM::CPSR, RegState::Kill);
LoadCmpBB->addSuccessor(DoneBB);
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
- // strex rStatus, rNew, [rAddr]
- // cmp rStatus, #0
+ // strex rTempReg, rNew, [rAddr]
+ // cmp rTempReg, #0
// bne .Lloadcmp
- MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg)
+ MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), TempReg)
.addReg(NewReg)
.addReg(AddrReg);
if (StrexOp == ARM::t2STREX)
MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
MIB.add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, getKillRegState(StatusDead))
+ .addReg(TempReg, RegState::Kill)
.addImm(0)
.add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
.addReg(ARM::CPSR, RegState::Kill);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
// Recompute livein lists.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
LivePhysRegs LiveRegs;
computeLiveIns(LiveRegs, MRI, *DoneBB);
computeLiveIns(LiveRegs, MRI, *StoreBB);
computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
// Do an extra pass around the loop to get loop carried registers right.
StoreBB->clearLiveIns();
computeLiveIns(LiveRegs, MRI, *StoreBB);
LoadCmpBB->clearLiveIns();
computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
return true;
}
/// ARM's ldrexd/strexd take a consecutive register pair (represented as a
/// single GPRPair register); Thumb's take two separate registers, so we need
/// to extract the subregs from the pair.
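/// E.g. (illustrative) in Thumb mode the pair register R0_R1 is added as the
/// two operands R0 (gsub_0) and R1 (gsub_1); in ARM mode the GPRPair register
/// itself is used directly.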
static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
unsigned Flags, bool IsThumb,
const TargetRegisterInfo *TRI) {
if (IsThumb) {
unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
} else
MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
}
/// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
- bool StatusDead = MI.getOperand(1).isDead();
+ unsigned TempReg = MI.getOperand(1).getReg();
// Duplicating undef operands into 2 instructions does not guarantee the same
// value on both; however, undef should be replaced by xzr anyway.
assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
unsigned AddrReg = MI.getOperand(2).getReg();
unsigned DesiredReg = MI.getOperand(3).getReg();
MachineOperand New = MI.getOperand(4);
New.setIsKill(false);
unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadCmpBB);
MF->insert(++LoadCmpBB->getIterator(), StoreBB);
MF->insert(++StoreBB->getIterator(), DoneBB);
// .Lloadcmp:
// ldrexd rDestLo, rDestHi, [rAddr]
// cmp rDestLo, rDesiredLo
- // sbcs rStatus<dead>, rDestHi, rDesiredHi
+ // sbcs rTempReg<dead>, rDestHi, rDesiredHi
// bne .Ldone
unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
.addReg(DestLo, getKillRegState(Dest.isDead()))
.addReg(DesiredLo)
.add(predOps(ARMCC::AL));
BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
.addReg(DestHi, getKillRegState(Dest.isDead()))
.addReg(DesiredHi)
.addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill);
unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
BuildMI(LoadCmpBB, DL, TII->get(Bcc))
.addMBB(DoneBB)
.addImm(ARMCC::NE)
.addReg(ARM::CPSR, RegState::Kill);
LoadCmpBB->addSuccessor(DoneBB);
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
- // strexd rStatus, rNewLo, rNewHi, [rAddr]
- // cmp rStatus, #0
+ // strexd rTempReg, rNewLo, rNewHi, [rAddr]
+ // cmp rTempReg, #0
// bne .Lloadcmp
unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
- MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+ MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, getKillRegState(StatusDead))
+ .addReg(TempReg, RegState::Kill)
.addImm(0)
.add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
.addReg(ARM::CPSR, RegState::Kill);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
// Recompute livein lists.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
LivePhysRegs LiveRegs;
computeLiveIns(LiveRegs, MRI, *DoneBB);
computeLiveIns(LiveRegs, MRI, *StoreBB);
computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
// Do an extra pass around the loop to get loop carried registers right.
StoreBB->clearLiveIns();
computeLiveIns(LiveRegs, MRI, *StoreBB);
LoadCmpBB->clearLiveIns();
computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
return true;
}
bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
default:
return false;
case ARM::TCRETURNdi:
case ARM::TCRETURNri: {
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
assert(MBBI->isReturn() &&
"Can only insert epilog into returning blocks");
unsigned RetOpcode = MBBI->getOpcode();
DebugLoc dl = MBBI->getDebugLoc();
const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
MBB.getParent()->getSubtarget().getInstrInfo());
// Tail call return: adjust the stack pointer and jump to callee.
MBBI = MBB.getLastNonDebugInstr();
MachineOperand &JumpTarget = MBBI->getOperand(0);
// Jump to label or value in register.
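// E.g. (illustrative) TCRETURNdi on a global becomes TAILJMPd (ARM) or
// tTAILJMPd / tTAILJMPdND (Thumb, depending on MachO), while TCRETURNri
// becomes TAILJMPr / tTAILJMPr on the register operand.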
if (RetOpcode == ARM::TCRETURNdi) {
unsigned TCOpcode =
STI->isThumb()
? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND)
: ARM::TAILJMPd;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
if (JumpTarget.isGlobal())
MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
JumpTarget.getTargetFlags());
else {
assert(JumpTarget.isSymbol());
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
}
// Add the default predicate in Thumb mode.
if (STI->isThumb())
MIB.add(predOps(ARMCC::AL));
} else if (RetOpcode == ARM::TCRETURNri) {
BuildMI(MBB, MBBI, dl,
TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
.addReg(JumpTarget.getReg(), RegState::Kill);
}
auto NewMI = std::prev(MBBI);
for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
NewMI->addOperand(MBBI->getOperand(i));
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
MBBI = NewMI;
return true;
}
case ARM::VMOVScc:
case ARM::VMOVDcc: {
unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
MI.getOperand(1).getReg())
.add(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm()) // 'pred'
.add(MI.getOperand(4));
MI.eraseFromParent();
return true;
}
case ARM::t2MOVCCr:
case ARM::MOVCCr: {
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
.add(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm()) // 'pred'
.add(MI.getOperand(4))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::MOVCCsi: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
(MI.getOperand(1).getReg()))
.add(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm())
.addImm(MI.getOperand(4).getImm()) // 'pred'
.add(MI.getOperand(5))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::MOVCCsr: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
(MI.getOperand(1).getReg()))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
.addImm(MI.getOperand(4).getImm())
.addImm(MI.getOperand(5).getImm()) // 'pred'
.add(MI.getOperand(6))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::t2MOVCCi16:
case ARM::MOVCCi16: {
unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
.add(MI.getOperand(4));
MI.eraseFromParent();
return true;
}
case ARM::t2MOVCCi:
case ARM::MOVCCi: {
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
.add(MI.getOperand(4))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::t2MVNCCi:
case ARM::MVNCCi: {
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
.add(MI.getOperand(4))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::t2MOVCClsl:
case ARM::t2MOVCClsr:
case ARM::t2MOVCCasr:
case ARM::t2MOVCCror: {
unsigned NewOpc;
switch (Opcode) {
case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
default: llvm_unreachable("unexpected conditional move");
}
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
.add(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm())
.addImm(MI.getOperand(4).getImm()) // 'pred'
.add(MI.getOperand(5))
.add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
case ARM::Int_eh_sjlj_dispatchsetup: {
MachineFunction &MF = *MI.getParent()->getParent();
const ARMBaseInstrInfo *AII =
static_cast<const ARMBaseInstrInfo*>(TII);
const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
// For functions using a base pointer, we rematerialize it (via the frame
// pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
// for us. Otherwise, expand to nothing.
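// E.g. (illustrative, ARM mode with r11 as frame pointer and an 8-byte
// dynamic alignment) this emits roughly:
//   sub r6, r11, #<FramePtrSpillOffset>
//   bic r6, r6, #7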
if (RI.hasBasePointer(MF)) {
int32_t NumBytes = AFI->getFramePtrSpillOffset();
unsigned FramePtr = RI.getFrameRegister(MF);
assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
"base pointer without frame pointer?");
if (AFI->isThumb2Function()) {
emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
} else if (AFI->isThumbFunction()) {
emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
FramePtr, -NumBytes, *TII, RI);
} else {
emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
FramePtr, -NumBytes, ARMCC::AL, 0,
*TII);
}
// If there's dynamic realignment, adjust for it.
if (RI.needsStackRealignment(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned MaxAlign = MFI.getMaxAlignment();
assert (!AFI->isThumb1OnlyFunction());
// Emit bic r6, r6, MaxAlign
assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
"immediates larger than 256 with all lower "
"bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6)
.addReg(ARM::R6, RegState::Kill)
.addImm(MaxAlign - 1)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
}
}
MI.eraseFromParent();
return true;
}
case ARM::MOVsrl_flag:
case ARM::MOVsra_flag: {
// These are just fancy MOVs (flag-setting MOV) instructions.
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
MI.getOperand(0).getReg())
.add(MI.getOperand(1))
.addImm(ARM_AM::getSORegOpc(
(Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr : ARM_AM::asr), 1))
.add(predOps(ARMCC::AL))
.addReg(ARM::CPSR, RegState::Define);
MI.eraseFromParent();
return true;
}
case ARM::RRX: {
// This encodes as "MOVs Rd, Rm, rrx".
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
MI.getOperand(0).getReg())
.add(MI.getOperand(1))
.addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0))
.add(predOps(ARMCC::AL))
.add(condCodeOp());
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
}
case ARM::tTPsoft:
case ARM::TPsoft: {
const bool Thumb = Opcode == ARM::tTPsoft;
MachineInstrBuilder MIB;
if (STI->genLongCalls()) {
MachineFunction *MF = MBB.getParent();
MachineConstantPool *MCP = MF->getConstantPool();
unsigned PCLabelID = AFI->createPICLabelUId();
MachineConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(MF->getFunction()->getContext(),
"__aeabi_read_tp", PCLabelID, 0);
unsigned Reg = MI.getOperand(0).getReg();
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (!Thumb)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBLXr : ARM::BLX));
if (Thumb)
MIB.add(predOps(ARMCC::AL));
MIB.addReg(Reg, RegState::Kill);
} else {
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBL : ARM::BL));
if (Thumb)
MIB.add(predOps(ARMCC::AL));
MIB.addExternalSymbol("__aeabi_read_tp", 0);
}
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
}
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
? ARM::tLDRpci : ARM::t2LDRpci;
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
.add(MI.getOperand(1))
.add(predOps(ARMCC::AL));
MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg)
.add(MI.getOperand(2));
TransferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
}
case ARM::LDRLIT_ga_abs:
case ARM::LDRLIT_ga_pcrel:
case ARM::LDRLIT_ga_pcrel_ldr:
case ARM::tLDRLIT_ga_abs:
case ARM::tLDRLIT_ga_pcrel: {
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
const GlobalValue *GV = MO1.getGlobal();
bool IsARM =
Opcode != ARM::tLDRLIT_ga_pcrel && Opcode != ARM::tLDRLIT_ga_abs;
bool IsPIC =
Opcode != ARM::LDRLIT_ga_abs && Opcode != ARM::tLDRLIT_ga_abs;
unsigned LDRLITOpc = IsARM ? ARM::LDRi12 : ARM::tLDRpci;
unsigned PICAddOpc =
IsARM
? (Opcode == ARM::LDRLIT_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
: ARM::tPICADD;
// We need a new const-pool entry to load from.
MachineConstantPool *MCP = MBB.getParent()->getConstantPool();
unsigned ARMPCLabelIndex = 0;
MachineConstantPoolValue *CPV;
if (IsPIC) {
unsigned PCAdj = IsARM ? 8 : 4;
ARMPCLabelIndex = AFI->createPICLabelUId();
CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex,
ARMCP::CPValue, PCAdj);
} else
CPV = ARMConstantPoolConstant::Create(GV, ARMCP::no_modifier);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (IsARM)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
if (IsPIC) {
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(PICAddOpc))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg)
.addImm(ARMPCLabelIndex);
if (IsARM)
MIB.add(predOps(ARMCC::AL));
}
MI.eraseFromParent();
return true;
}
case ARM::MOV_ga_pcrel:
case ARM::MOV_ga_pcrel_ldr:
case ARM::t2MOV_ga_pcrel: {
// Expand into movw + movt. Also "add pc" / ldr [pc] in PIC mode.
unsigned LabelId = AFI->createPICLabelUId();
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
const MachineOperand &MO1 = MI.getOperand(1);
const GlobalValue *GV = MO1.getGlobal();
unsigned TF = MO1.getTargetFlags();
bool isARM = Opcode != ARM::t2MOV_ga_pcrel;
unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
unsigned LO16TF = TF | ARMII::MO_LO16;
unsigned HI16TF = TF | ARMII::MO_HI16;
unsigned PICAddOpc = isARM
? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
: ARM::tPICADD;
MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(LO16Opc), DstReg)
.addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
.addImm(LabelId);
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc), DstReg)
.addReg(DstReg)
.addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
.addImm(LabelId);
MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(PICAddOpc))
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg).addImm(LabelId);
if (isARM) {
MIB3.add(predOps(ARMCC::AL));
if (Opcode == ARM::MOV_ga_pcrel_ldr)
MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
TransferImpOps(MI, MIB1, MIB3);
MI.eraseFromParent();
return true;
}
case ARM::MOVi32imm:
case ARM::MOVCCi32imm:
case ARM::t2MOVi32imm:
case ARM::t2MOVCCi32imm:
ExpandMOV32BitImm(MBB, MBBI);
return true;
case ARM::SUBS_PC_LR: {
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
.addReg(ARM::LR)
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
.addReg(ARM::CPSR, RegState::Undef);
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
}
case ARM::VLDMQIA: {
unsigned NewOpc = ARM::VLDMDIA;
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
unsigned OpIdx = 0;
// Grab the Q register destination.
bool DstIsDead = MI.getOperand(OpIdx).isDead();
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
// Copy the source register.
MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Add the destination operands (D subregs).
unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
TransferImpOps(MI, MIB, MIB);
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
return true;
}
case ARM::VSTMQIA: {
unsigned NewOpc = ARM::VSTMDIA;
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
unsigned OpIdx = 0;
// Grab the Q register source.
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
// Copy the destination register.
MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
MIB.add(MI.getOperand(OpIdx++));
// Add the source operands (D subregs).
unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
MIB.addReg(D0, SrcIsKill ? RegState::Kill : 0)
.addReg(D1, SrcIsKill ? RegState::Kill : 0);
if (SrcIsKill) // Add an implicit kill for the Q register.
MIB->addRegisterKilled(SrcReg, TRI, true);
TransferImpOps(MI, MIB, MIB);
MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI.eraseFromParent();
return true;
}
case ARM::VLD2q8Pseudo:
case ARM::VLD2q16Pseudo:
case ARM::VLD2q32Pseudo:
case ARM::VLD2q8PseudoWB_fixed:
case ARM::VLD2q16PseudoWB_fixed:
case ARM::VLD2q32PseudoWB_fixed:
case ARM::VLD2q8PseudoWB_register:
case ARM::VLD2q16PseudoWB_register:
case ARM::VLD2q32PseudoWB_register:
case ARM::VLD3d8Pseudo:
case ARM::VLD3d16Pseudo:
case ARM::VLD3d32Pseudo:
case ARM::VLD1d64TPseudo:
case ARM::VLD1d64TPseudoWB_fixed:
case ARM::VLD3d8Pseudo_UPD:
case ARM::VLD3d16Pseudo_UPD:
case ARM::VLD3d32Pseudo_UPD:
case ARM::VLD3q8Pseudo_UPD:
case ARM::VLD3q16Pseudo_UPD:
case ARM::VLD3q32Pseudo_UPD:
case ARM::VLD3q8oddPseudo:
case ARM::VLD3q16oddPseudo:
case ARM::VLD3q32oddPseudo:
case ARM::VLD3q8oddPseudo_UPD:
case ARM::VLD3q16oddPseudo_UPD:
case ARM::VLD3q32oddPseudo_UPD:
case ARM::VLD4d8Pseudo:
case ARM::VLD4d16Pseudo:
case ARM::VLD4d32Pseudo:
case ARM::VLD1d64QPseudo:
case ARM::VLD1d64QPseudoWB_fixed:
case ARM::VLD4d8Pseudo_UPD:
case ARM::VLD4d16Pseudo_UPD:
case ARM::VLD4d32Pseudo_UPD:
case ARM::VLD4q8Pseudo_UPD:
case ARM::VLD4q16Pseudo_UPD:
case ARM::VLD4q32Pseudo_UPD:
case ARM::VLD4q8oddPseudo:
case ARM::VLD4q16oddPseudo:
case ARM::VLD4q32oddPseudo:
case ARM::VLD4q8oddPseudo_UPD:
case ARM::VLD4q16oddPseudo_UPD:
case ARM::VLD4q32oddPseudo_UPD:
case ARM::VLD3DUPd8Pseudo:
case ARM::VLD3DUPd16Pseudo:
case ARM::VLD3DUPd32Pseudo:
case ARM::VLD3DUPd8Pseudo_UPD:
case ARM::VLD3DUPd16Pseudo_UPD:
case ARM::VLD3DUPd32Pseudo_UPD:
case ARM::VLD4DUPd8Pseudo:
case ARM::VLD4DUPd16Pseudo:
case ARM::VLD4DUPd32Pseudo:
case ARM::VLD4DUPd8Pseudo_UPD:
case ARM::VLD4DUPd16Pseudo_UPD:
case ARM::VLD4DUPd32Pseudo_UPD:
ExpandVLD(MBBI);
return true;
case ARM::VST2q8Pseudo:
case ARM::VST2q16Pseudo:
case ARM::VST2q32Pseudo:
case ARM::VST2q8PseudoWB_fixed:
case ARM::VST2q16PseudoWB_fixed:
case ARM::VST2q32PseudoWB_fixed:
case ARM::VST2q8PseudoWB_register:
case ARM::VST2q16PseudoWB_register:
case ARM::VST2q32PseudoWB_register:
case ARM::VST3d8Pseudo:
case ARM::VST3d16Pseudo:
case ARM::VST3d32Pseudo:
case ARM::VST1d64TPseudo:
case ARM::VST3d8Pseudo_UPD:
case ARM::VST3d16Pseudo_UPD:
case ARM::VST3d32Pseudo_UPD:
case ARM::VST1d64TPseudoWB_fixed:
case ARM::VST1d64TPseudoWB_register:
case ARM::VST3q8Pseudo_UPD:
case ARM::VST3q16Pseudo_UPD:
case ARM::VST3q32Pseudo_UPD:
case ARM::VST3q8oddPseudo:
case ARM::VST3q16oddPseudo:
case ARM::VST3q32oddPseudo:
case ARM::VST3q8oddPseudo_UPD:
case ARM::VST3q16oddPseudo_UPD:
case ARM::VST3q32oddPseudo_UPD:
case ARM::VST4d8Pseudo:
case ARM::VST4d16Pseudo:
case ARM::VST4d32Pseudo:
case ARM::VST1d64QPseudo:
case ARM::VST4d8Pseudo_UPD:
case ARM::VST4d16Pseudo_UPD:
case ARM::VST4d32Pseudo_UPD:
case ARM::VST1d64QPseudoWB_fixed:
case ARM::VST1d64QPseudoWB_register:
case ARM::VST4q8Pseudo_UPD:
case ARM::VST4q16Pseudo_UPD:
case ARM::VST4q32Pseudo_UPD:
case ARM::VST4q8oddPseudo:
case ARM::VST4q16oddPseudo:
case ARM::VST4q32oddPseudo:
case ARM::VST4q8oddPseudo_UPD:
case ARM::VST4q16oddPseudo_UPD:
case ARM::VST4q32oddPseudo_UPD:
ExpandVST(MBBI);
return true;
case ARM::VLD1LNq8Pseudo:
case ARM::VLD1LNq16Pseudo:
case ARM::VLD1LNq32Pseudo:
case ARM::VLD1LNq8Pseudo_UPD:
case ARM::VLD1LNq16Pseudo_UPD:
case ARM::VLD1LNq32Pseudo_UPD:
case ARM::VLD2LNd8Pseudo:
case ARM::VLD2LNd16Pseudo:
case ARM::VLD2LNd32Pseudo:
case ARM::VLD2LNq16Pseudo:
case ARM::VLD2LNq32Pseudo:
case ARM::VLD2LNd8Pseudo_UPD:
case ARM::VLD2LNd16Pseudo_UPD:
case ARM::VLD2LNd32Pseudo_UPD:
case ARM::VLD2LNq16Pseudo_UPD:
case ARM::VLD2LNq32Pseudo_UPD:
case ARM::VLD3LNd8Pseudo:
case ARM::VLD3LNd16Pseudo:
case ARM::VLD3LNd32Pseudo:
case ARM::VLD3LNq16Pseudo:
case ARM::VLD3LNq32Pseudo:
case ARM::VLD3LNd8Pseudo_UPD:
case ARM::VLD3LNd16Pseudo_UPD:
case ARM::VLD3LNd32Pseudo_UPD:
case ARM::VLD3LNq16Pseudo_UPD:
case ARM::VLD3LNq32Pseudo_UPD:
case ARM::VLD4LNd8Pseudo:
case ARM::VLD4LNd16Pseudo:
case ARM::VLD4LNd32Pseudo:
case ARM::VLD4LNq16Pseudo:
case ARM::VLD4LNq32Pseudo:
case ARM::VLD4LNd8Pseudo_UPD:
case ARM::VLD4LNd16Pseudo_UPD:
case ARM::VLD4LNd32Pseudo_UPD:
case ARM::VLD4LNq16Pseudo_UPD:
case ARM::VLD4LNq32Pseudo_UPD:
case ARM::VST1LNq8Pseudo:
case ARM::VST1LNq16Pseudo:
case ARM::VST1LNq32Pseudo:
case ARM::VST1LNq8Pseudo_UPD:
case ARM::VST1LNq16Pseudo_UPD:
case ARM::VST1LNq32Pseudo_UPD:
case ARM::VST2LNd8Pseudo:
case ARM::VST2LNd16Pseudo:
case ARM::VST2LNd32Pseudo:
case ARM::VST2LNq16Pseudo:
case ARM::VST2LNq32Pseudo:
case ARM::VST2LNd8Pseudo_UPD:
case ARM::VST2LNd16Pseudo_UPD:
case ARM::VST2LNd32Pseudo_UPD:
case ARM::VST2LNq16Pseudo_UPD:
case ARM::VST2LNq32Pseudo_UPD:
case ARM::VST3LNd8Pseudo:
case ARM::VST3LNd16Pseudo:
case ARM::VST3LNd32Pseudo:
case ARM::VST3LNq16Pseudo:
case ARM::VST3LNq32Pseudo:
case ARM::VST3LNd8Pseudo_UPD:
case ARM::VST3LNd16Pseudo_UPD:
case ARM::VST3LNd32Pseudo_UPD:
case ARM::VST3LNq16Pseudo_UPD:
case ARM::VST3LNq32Pseudo_UPD:
case ARM::VST4LNd8Pseudo:
case ARM::VST4LNd16Pseudo:
case ARM::VST4LNd32Pseudo:
case ARM::VST4LNq16Pseudo:
case ARM::VST4LNq32Pseudo:
case ARM::VST4LNd8Pseudo_UPD:
case ARM::VST4LNd16Pseudo_UPD:
case ARM::VST4LNd32Pseudo_UPD:
case ARM::VST4LNq16Pseudo_UPD:
case ARM::VST4LNq32Pseudo_UPD:
ExpandLaneOp(MBBI);
return true;
case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
case ARM::CMP_SWAP_8:
if (STI->isThumb())
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
ARM::tUXTB, NextMBBI);
else
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
ARM::UXTB, NextMBBI);
case ARM::CMP_SWAP_16:
if (STI->isThumb())
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
ARM::tUXTH, NextMBBI);
else
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
ARM::UXTH, NextMBBI);
case ARM::CMP_SWAP_32:
if (STI->isThumb())
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
NextMBBI);
else
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
}
}
bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
Modified |= ExpandMI(MBB, MBBI, NMBBI);
MBBI = NMBBI;
}
return Modified;
}
bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
AFI = MF.getInfo<ARMFunctionInfo>();
bool Modified = false;
for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
++MFI)
Modified |= ExpandMBB(*MFI);
if (VerifyARMPseudo)
MF.verify(this, "After expanding ARM pseudo instructions.");
return Modified;
}
/// createARMExpandPseudoPass - returns an instance of the pseudo instruction
/// expansion pass.
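/// A minimal usage sketch (assuming the usual TargetPassConfig hook, not shown
/// here): the ARM pass configuration calls addPass(createARMExpandPseudoPass())
/// so the expansion runs before final instruction emission.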
FunctionPass *llvm::createARMExpandPseudoPass() {
return new ARMExpandPseudo();
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index d06b7d0896f1..7206083a7079 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -1,6080 +1,6080 @@
//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the ARM instructions in TableGen format.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// ARM specific DAG Nodes.
//
// Type profiles.
def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_ARMCMov : SDTypeProfile<1, 3,
[SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
SDTCisVT<3, i32>]>;
def SDT_ARMBrcond : SDTypeProfile<0, 2,
[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
def SDT_ARMBrJT : SDTypeProfile<0, 2,
[SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_ARMBr2JT : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
[SDTCisVT<0, i32>,
SDTCisVT<1, i32>, SDTCisVT<2, i32>,
SDTCisVT<3, i32>, SDTCisVT<4, i32>,
SDTCisVT<5, OtherVT>]>;
def SDT_ARMAnd : SDTypeProfile<1, 2,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>]>;
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
SDTCisInt<2>]>;
def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>;
def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
SDTCisInt<1>]>;
def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>,
SDTCisVT<4, i32>]>;
def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>, SDTCisVT<1, i32>]>;
// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisInt<0>,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
SDTCisSameAs<0, 4>,
SDTCisSameAs<0, 5>]>;
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntUnaryOp>;
def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
[SDNPHasChain, SDNPSideEffect,
SDNPOptInGlue, SDNPOutGlue]>;
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
SDT_ARMStructByVal,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad]>;
def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
[SDNPHasChain]>;
def ARMbr2jt : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
[SDNPHasChain]>;
def ARMBcci64 : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
[SDNPHasChain]>;
def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp,
[SDNPOutGlue]>;
def ARMcmn : SDNode<"ARMISD::CMN", SDT_ARMCmp,
[SDNPOutGlue]>;
def ARMcmpZ : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
[SDNPOutGlue, SDNPCommutative]>;
def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
SDT_ARMEH_SJLJ_Setjmp,
[SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
SDT_ARMEH_SJLJ_Longjmp,
[SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH",
SDT_ARMEH_SJLJ_SetupDispatch,
[SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
[SDNPHasChain, SDNPSideEffect]>;
def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad]>;
def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>;
def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
AssemblerPredicate<"HasV4TOps", "armv4t">;
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
AssemblerPredicate<"HasV5TOps", "armv5t">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
AssemblerPredicate<"HasV5TEOps", "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
AssemblerPredicate<"HasV6Ops", "armv6">;
def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
AssemblerPredicate<"HasV6MOps",
"armv6m or armv6t2">;
def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
AssemblerPredicate<"HasV8MBaselineOps",
"armv8m.base">;
def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
AssemblerPredicate<"HasV8MMainlineOps",
"armv8m.main">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
AssemblerPredicate<"HasV6KOps", "armv6k">;
def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
AssemblerPredicate<"HasV7Ops", "armv7">;
def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
AssemblerPredicate<"HasV8Ops", "armv8">;
def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
AssemblerPredicate<"FeatureVFP3", "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
AssemblerPredicate<"FeatureVFP4", "VFP4">;
def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">,
AssemblerPredicate<"!FeatureVFPOnlySP",
"double precision VFP">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "NEON">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
AssemblerPredicate<"FeatureCrypto", "crypto">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<"FeatureRAS", "ras">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16","full half-float">;
def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
def HasDSP : Predicate<"Subtarget->hasDSP()">,
AssemblerPredicate<"FeatureDSP", "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
AssemblerPredicate<"FeatureDB",
"data-barriers">;
def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
AssemblerPredicate<"FeatureV7Clrex",
"v7 clrex">;
def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
AssemblerPredicate<"FeatureAcquireRelease",
"acquire/release">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
AssemblerPredicate<"FeatureMP",
"mp-extensions">;
def HasVirtualization: Predicate<"false">,
AssemblerPredicate<"FeatureVirtualization",
"virtualization-extensions">;
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
AssemblerPredicate<"FeatureTrustZone",
"TrustZone">;
def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
AssemblerPredicate<"Feature8MSecExt",
"ARMv8-M Security Extensions">;
def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">,
AssemblerPredicate<"ModeThumb", "thumb">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
AssemblerPredicate<"ModeThumb,FeatureThumb2",
"thumb2">;
def IsMClass : Predicate<"Subtarget->isMClass()">,
AssemblerPredicate<"FeatureMClass", "armv*m">;
def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
AssemblerPredicate<"!FeatureMClass",
"!armv*m">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
AssemblerPredicate<"!ModeThumb", "arm-mode">;
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
def UseNegativeImmediates :
Predicate<"false">,
AssemblerPredicate<"!FeatureNoNegativeImmediates",
"NegativeImmediates">;
// FIXME: Eventually this will be just "hasV6T2Ops".
let RecomputePerFunction = 1 in {
def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
}
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
// But only select them if more precision in FP computation is allowed.
// Do not use them for Darwin platforms.
def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast && "
" Subtarget->hasVFP4()) && "
"!Subtarget->isTargetDarwin()">;
def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast &&"
" Subtarget->hasVFP4()) || "
"Subtarget->isTargetDarwin()">;
def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
def HasFastVDUP32 : Predicate<"!Subtarget->hasSlowVDUP32()">;
def HasSlowVDUP32 : Predicate<"Subtarget->hasSlowVDUP32()">;
def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
"!Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
"Subtarget->useNEONForSinglePrecisionFP()">;
let RecomputePerFunction = 1 in {
def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
}
def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
class RegConstraint<string C> {
string Constraints = C;
}
//===----------------------------------------------------------------------===//
// ARM specific transformation functions and pattern fragments.
//
// imm_neg_XFORM - Return the negation of an i32 immediate value.
def imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-(int)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
// imm_not_XFORM - Return the complement of a i32 immediate value.
def imm_not_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
def imm16_31 : ImmLeaf<i32, [{
return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
}]>;
// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
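// I.e. it accepts a value with at least 17 known sign bits, or one of the
// form (sra (shl x, 16), 16).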
def sext_16_node : PatLeaf<(i32 GPR:$a), [{
if (CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17)
return true;
if (N->getOpcode() != ISD::SRA)
return false;
if (N->getOperand(0).getOpcode() != ISD::SHL)
return false;
auto *ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!ShiftVal || ShiftVal->getZExtValue() != 16)
return false;
ShiftVal = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
if (!ShiftVal || ShiftVal->getZExtValue() != 16)
return false;
return true;
}]>;
/// Split a 32-bit immediate into two 16 bit parts.
def hi16 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, SDLoc(N),
MVT::i32);
}]>;
def lo16AllZero : PatLeaf<(i32 imm), [{
// Returns true if all low 16-bits are 0.
return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
}], hi16>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
// An 'and' node with a single use.
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// An 'xor' node with a single use.
def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
return N->hasOneUse();
}]>;
// An 'fmul' node with a single use.
def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
return N->hasOneUse();
}]>;
// An 'fadd' node which checks for single non-hazardous use.
def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
// An 'fsub' node which checks for single non-hazardous use.
def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
//===----------------------------------------------------------------------===//
// Operand Definitions.
//
// Immediate operands with a shared generic asm render method.
class ImmAsmOperand<int Low, int High> : AsmOperandClass {
let RenderMethod = "addImmOperands";
let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
let DiagnosticType = "ImmRange" # Low # "_" # High;
}
class ImmAsmOperandMinusOne<int Low, int High> : AsmOperandClass {
let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
let DiagnosticType = "ImmRange" # Low # "_" # High;
}
// Operands that are part of a memory addressing mode.
class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
// Branch target.
// FIXME: rename brtarget to t2_brtarget
def brtarget : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
let DecoderMethod = "DecodeT2BROperand";
}
// Branches targeting ARM-mode must be divisible by 4 if they're a raw
// immediate.
def ARMBranchTarget : AsmOperandClass {
let Name = "ARMBranchTarget";
}
// Branches targeting Thumb-mode must be divisible by 2 if they're a raw
// immediate.
def ThumbBranchTarget : AsmOperandClass {
let Name = "ThumbBranchTarget";
}
def arm_br_target : Operand<OtherVT> {
let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getARMBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// Call target for ARM. Handles conditional/unconditional calls.
// FIXME: rename bl_target to t2_bltarget?
def arm_bl_target : Operand<i32> {
let ParserMatchClass = ARMBranchTarget;
let EncoderMethod = "getARMBLTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// Target for BLX *from* ARM mode.
def arm_blx_target : Operand<i32> {
let ParserMatchClass = ThumbBranchTarget;
let EncoderMethod = "getARMBLXTargetOpValue";
let OperandType = "OPERAND_PCREL";
}
// A list of registers separated by comma. Used by load/store multiple.
def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
def reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = RegListAsmOperand;
let PrintMethod = "printRegisterList";
let DecoderMethod = "DecodeRegListOperand";
}
def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
def dpr_reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = DPRRegListAsmOperand;
let PrintMethod = "printRegisterList";
let DecoderMethod = "DecodeDPRRegListOperand";
}
def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
def spr_reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = SPRRegListAsmOperand;
let PrintMethod = "printRegisterList";
let DecoderMethod = "DecodeSPRRegListOperand";
}
// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
def cpinst_operand : Operand<i32> {
let PrintMethod = "printCPInstOperand";
}
// Local PC labels.
def pclabel : Operand<i32> {
let PrintMethod = "printPCLabel";
}
// ADR instruction labels.
def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
def adrlabel : Operand<i32> {
let EncoderMethod = "getAdrLabelOpValue";
let ParserMatchClass = AdrLabelAsmOperand;
let PrintMethod = "printAdrLabelOperand<0>";
}
def neon_vcvt_imm32 : Operand<i32> {
let EncoderMethod = "getNEONVcvtImm32OpValue";
let DecoderMethod = "DecodeVCVTImmOperand";
}
// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
def rot_imm_XFORM: SDNodeXForm<imm, [{
switch (N->getZExtValue()){
default: llvm_unreachable(nullptr);
case 0: return CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
case 8: return CurDAG->getTargetConstant(1, SDLoc(N), MVT::i32);
case 16: return CurDAG->getTargetConstant(2, SDLoc(N), MVT::i32);
case 24: return CurDAG->getTargetConstant(3, SDLoc(N), MVT::i32);
}
}]>;
def RotImmAsmOperand : AsmOperandClass {
let Name = "RotImm";
let ParserMethod = "parseRotImm";
}
def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
int32_t v = N->getZExtValue();
return v == 8 || v == 16 || v == 24; }],
rot_imm_XFORM> {
let PrintMethod = "printRotImmOperand";
let ParserMatchClass = RotImmAsmOperand;
}
// shift_imm: An integer that encodes a shift amount and the type of shift
// (asr or lsl). The 6-bit immediate encodes as:
//   {5}    0 ==> lsl
//          1 ==> asr
//   {4-0}  imm5 shift amount.
// asr #32 encoded as imm5 == 0.
def ShifterImmAsmOperand : AsmOperandClass {
let Name = "ShifterImm";
let ParserMethod = "parseShifterImm";
}
def shift_imm : Operand<i32> {
let PrintMethod = "printShiftImmOperand";
let ParserMatchClass = ShifterImmAsmOperand;
}
// shifter_operand operands: so_reg_reg, so_reg_imm, and mod_imm.
def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
def so_reg_reg : Operand<i32>, // reg reg imm
ComplexPattern<i32, 3, "SelectRegShifterOperand",
[shl, srl, sra, rotr]> {
let EncoderMethod = "getSORegRegOpValue";
let PrintMethod = "printSORegRegOperand";
let DecoderMethod = "DecodeSORegRegOperand";
let ParserMatchClass = ShiftedRegAsmOperand;
let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
}
def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
def so_reg_imm : Operand<i32>, // reg imm
ComplexPattern<i32, 2, "SelectImmShifterOperand",
[shl, srl, sra, rotr]> {
let EncoderMethod = "getSORegImmOpValue";
let PrintMethod = "printSORegImmOperand";
let DecoderMethod = "DecodeSORegImmOperand";
let ParserMatchClass = ShiftedImmAsmOperand;
let MIOperandInfo = (ops GPR, i32imm);
}
// FIXME: Does this need to be distinct from so_reg?
def shift_so_reg_reg : Operand<i32>, // reg reg imm
ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
[shl,srl,sra,rotr]> {
let EncoderMethod = "getSORegRegOpValue";
let PrintMethod = "printSORegRegOperand";
let DecoderMethod = "DecodeSORegRegOperand";
let ParserMatchClass = ShiftedRegAsmOperand;
let MIOperandInfo = (ops GPR, GPR, i32imm);
}
// FIXME: Does this need to be distinct from so_reg?
def shift_so_reg_imm : Operand<i32>, // reg imm
ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
[shl,srl,sra,rotr]> {
let EncoderMethod = "getSORegImmOpValue";
let PrintMethod = "printSORegImmOperand";
let DecoderMethod = "DecodeSORegImmOperand";
let ParserMatchClass = ShiftedImmAsmOperand;
let MIOperandInfo = (ops GPR, i32imm);
}
// mod_imm: match a 32-bit immediate operand, which can be encoded into
// a 12-bit immediate; an 8-bit integer and a 4-bit rotator (See ARMARM
// - "Modified Immediate Constants"). Within the MC layer we keep this
// immediate in its encoded form.
def ModImmAsmOperand: AsmOperandClass {
let Name = "ModImm";
let ParserMethod = "parseModImm";
}
def mod_imm : Operand<i32>, ImmLeaf<i32, [{
return ARM_AM::getSOImmVal(Imm) != -1;
}]> {
let EncoderMethod = "getModImmOpValue";
let PrintMethod = "printModImmOperand";
let ParserMatchClass = ModImmAsmOperand;
}
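// For illustration: 0x000003FC is a valid modified immediate, since it is
// 0xFF rotated right by 30 (imm8 = 0xFF, rot = 15, rotation = 2 * rot), so
// ARM_AM::getSOImmVal(0x3FC) returns a valid encoding; 0x00000101 is not,
// because its set bits do not fit in any even-rotated 8-bit window, so
// getSOImmVal returns -1 and the ImmLeaf above rejects it.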
// Note: the patterns mod_imm_not and mod_imm_neg do not require an encoder
// method and such, as they are only used on aliases (Pat<> and InstAlias<>).
// The actual parsing, encoding, decoding are handled by the destination
// instructions, which use mod_imm.
def ModImmNotAsmOperand : AsmOperandClass { let Name = "ModImmNot"; }
def mod_imm_not : Operand<i32>, PatLeaf<(imm), [{
return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
}], imm_not_XFORM> {
let ParserMatchClass = ModImmNotAsmOperand;
}
def ModImmNegAsmOperand : AsmOperandClass { let Name = "ModImmNeg"; }
def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{
unsigned Value = -(unsigned)N->getZExtValue();
return Value && ARM_AM::getSOImmVal(Value) != -1;
}], imm_neg_XFORM> {
let ParserMatchClass = ModImmNegAsmOperand;
}
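// Rough sketch of how mod_imm_neg is typically used in a Pat<> alias
// (assuming a SUBri instruction produced by the AsI1_bin_irs multiclass
// defined below; the actual patterns live further down in this file):
//   def : Pat<(add GPR:$src, mod_imm_neg:$imm),
//             (SUBri GPR:$src, mod_imm_neg:$imm)>;
// imm_neg_XFORM rewrites the constant so that SUBri still receives an
// ordinary encodable mod_imm operand.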
/// arm_i32imm - True for +V6T2, or when isSOImmTwoPartVal()
def arm_i32imm : PatLeaf<(imm), [{
if (Subtarget->useMovt(*MF))
return true;
return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
}]>;
/// imm0_1 predicate - Immediate in the range [0,1].
def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
/// imm0_3 predicate - Immediate in the range [0,3].
def Imm0_3AsmOperand: ImmAsmOperand<0,3> { let Name = "Imm0_3"; }
def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
/// imm0_7 predicate - Immediate in the range [0,7].
def Imm0_7AsmOperand: ImmAsmOperand<0,7> {
let Name = "Imm0_7";
}
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 8;
}]> {
let ParserMatchClass = Imm0_7AsmOperand;
}
/// imm8_255 predicate - Immediate in the range [8,255].
def Imm8_255AsmOperand: ImmAsmOperand<8,255> { let Name = "Imm8_255"; }
def imm8_255 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 8 && Imm < 256;
}]> {
let ParserMatchClass = Imm8_255AsmOperand;
}
/// imm8 predicate - Immediate is exactly 8.
def Imm8AsmOperand: ImmAsmOperand<8,8> { let Name = "Imm8"; }
def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
let ParserMatchClass = Imm8AsmOperand;
}
/// imm16 predicate - Immediate is exactly 16.
def Imm16AsmOperand: ImmAsmOperand<16,16> { let Name = "Imm16"; }
def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
let ParserMatchClass = Imm16AsmOperand;
}
/// imm32 predicate - Immediate is exactly 32.
def Imm32AsmOperand: ImmAsmOperand<32,32> { let Name = "Imm32"; }
def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
let ParserMatchClass = Imm32AsmOperand;
}
def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
/// imm1_7 predicate - Immediate in the range [1,7].
def Imm1_7AsmOperand: ImmAsmOperand<1,7> { let Name = "Imm1_7"; }
def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
let ParserMatchClass = Imm1_7AsmOperand;
}
/// imm1_15 predicate - Immediate in the range [1,15].
def Imm1_15AsmOperand: ImmAsmOperand<1,15> { let Name = "Imm1_15"; }
def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
let ParserMatchClass = Imm1_15AsmOperand;
}
/// imm1_31 predicate - Immediate in the range [1,31].
def Imm1_31AsmOperand: ImmAsmOperand<1,31> { let Name = "Imm1_31"; }
def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
let ParserMatchClass = Imm1_31AsmOperand;
}
/// imm0_15 predicate - Immediate in the range [0,15].
def Imm0_15AsmOperand: ImmAsmOperand<0,15> {
let Name = "Imm0_15";
let DiagnosticType = "ImmRange0_15";
}
def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 16;
}]> {
let ParserMatchClass = Imm0_15AsmOperand;
}
/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
def Imm0_31AsmOperand: ImmAsmOperand<0,31> { let Name = "Imm0_31"; }
def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 32;
}]> {
let ParserMatchClass = Imm0_31AsmOperand;
}
/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
def Imm0_32AsmOperand: ImmAsmOperand<0,32> { let Name = "Imm0_32"; }
def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 33;
}]> {
let ParserMatchClass = Imm0_32AsmOperand;
}
/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
def Imm0_63AsmOperand: ImmAsmOperand<0,63> { let Name = "Imm0_63"; }
def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 64;
}]> {
let ParserMatchClass = Imm0_63AsmOperand;
}
/// imm0_239 predicate - Immediate in the range [0,239].
def Imm0_239AsmOperand : ImmAsmOperand<0,239> {
let Name = "Imm0_239";
let DiagnosticType = "ImmRange0_239";
}
def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
let ParserMatchClass = Imm0_239AsmOperand;
}
/// imm0_255 predicate - Immediate in the range [0,255].
def Imm0_255AsmOperand : ImmAsmOperand<0,255> { let Name = "Imm0_255"; }
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
let ParserMatchClass = Imm0_255AsmOperand;
}
/// imm0_65535 - An immediate in the range [0,65535].
def Imm0_65535AsmOperand: ImmAsmOperand<0,65535> { let Name = "Imm0_65535"; }
def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 65536;
}]> {
let ParserMatchClass = Imm0_65535AsmOperand;
}
// imm0_65535_neg - An immediate whose negative value is in the range [0,65535].
def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
return -Imm >= 0 && -Imm < 65536;
}]>;
// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
// a relocatable expression.
//
// FIXME: This really needs a Thumb version separate from the ARM version.
// While the range is the same, and can thus use the same match class,
// the encoding is different so it should have a different encoder method.
def Imm0_65535ExprAsmOperand: AsmOperandClass {
let Name = "Imm0_65535Expr";
let RenderMethod = "addImmOperands";
}
def imm0_65535_expr : Operand<i32> {
let EncoderMethod = "getHiLo16ImmOpValue";
let ParserMatchClass = Imm0_65535ExprAsmOperand;
}
def Imm256_65535ExprAsmOperand: ImmAsmOperand<256,65535> { let Name = "Imm256_65535Expr"; }
def imm256_65535_expr : Operand<i32> {
let ParserMatchClass = Imm256_65535ExprAsmOperand;
}
/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
def Imm24bitAsmOperand: ImmAsmOperand<0,0xffffff> { let Name = "Imm24bit"; }
def imm24b : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm <= 0xffffff;
}]> {
let ParserMatchClass = Imm24bitAsmOperand;
}
/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
/// e.g., 0xf000ffff
def BitfieldAsmOperand : AsmOperandClass {
let Name = "Bitfield";
let ParserMethod = "parseBitfield";
}
def bf_inv_mask_imm : Operand<i32>,
PatLeaf<(imm), [{
return ARM::isBitFieldInvertedMask(N->getZExtValue());
}] > {
let EncoderMethod = "getBitfieldInvertedMaskOpValue";
let PrintMethod = "printBitfieldInvMaskImmOperand";
let DecoderMethod = "DecodeBitfieldMaskOperand";
let ParserMatchClass = BitfieldAsmOperand;
}
def imm1_32_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
def Imm1_32AsmOperand: ImmAsmOperandMinusOne<1,32> {
let Name = "Imm1_32";
}
def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
}],
imm1_32_XFORM> {
let PrintMethod = "printImmPlusOneOperand";
let ParserMatchClass = Imm1_32AsmOperand;
}
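// For example, a DAG constant of 32 is turned into the target constant 31 by
// imm1_32_XFORM, and printImmPlusOneOperand prints it back as #32.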
def imm1_16_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
def Imm1_16AsmOperand: ImmAsmOperandMinusOne<1,16> { let Name = "Imm1_16"; }
def imm1_16 : Operand<i32>, ImmLeaf<i32, [{
return Imm > 0 && Imm <= 16;
}],
imm1_16_XFORM> {
let PrintMethod = "printImmPlusOneOperand";
let ParserMatchClass = Imm1_16AsmOperand;
}
// Define ARM specific addressing modes.
// addrmode_imm12 := reg +/- imm12
//
def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
class AddrMode_Imm12 : MemOperand,
ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
// 12-bit immediate operand. Note that instructions using this encode
// #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
// immediate values are as normal.
let EncoderMethod = "getAddrModeImm12OpValue";
let DecoderMethod = "DecodeAddrModeImm12Operand";
let ParserMatchClass = MemImm12OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
def addrmode_imm12 : AddrMode_Imm12 {
let PrintMethod = "printAddrModeImm12Operand<false>";
}
def addrmode_imm12_pre : AddrMode_Imm12 {
let PrintMethod = "printAddrModeImm12Operand<true>";
}
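// For example, "ldr r0, [r1, #0]" carries an offset immediate of 0, while
// "ldr r0, [r1, #-0]" carries the INT32_MIN sentinel so the encoder can
// clear the U (add) bit.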
// ldst_so_reg := reg +/- reg shop imm
//
def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
def ldst_so_reg : MemOperand,
ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
let EncoderMethod = "getLdStSORegOpValue";
// FIXME: Simplify the printer
let PrintMethod = "printAddrMode2Operand";
let DecoderMethod = "DecodeSORegMemOperand";
let ParserMatchClass = MemRegOffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
}
// postidx_imm8 := +/- [0,255]
//
// 9-bit value:
//   {8}    1 if imm8 is non-negative, 0 otherwise.
//   {7-0}  [0,255] imm8 value.
def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
def postidx_imm8 : MemOperand {
let PrintMethod = "printPostIdxImm8Operand";
let ParserMatchClass = PostIdxImm8AsmOperand;
let MIOperandInfo = (ops i32imm);
}
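// For example, a post-indexed "#4" is encoded as {8} = 1, {7-0} = 4, and
// "#-4" as {8} = 0, {7-0} = 4.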
// postidx_imm8s4 := +/- [0,1020]
//
// 9-bit value:
//   {8}    1 if imm8 is non-negative, 0 otherwise.
//   {7-0}  [0,255] imm8 value, scaled by 4.
def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
def postidx_imm8s4 : MemOperand {
let PrintMethod = "printPostIdxImm8s4Operand";
let ParserMatchClass = PostIdxImm8s4AsmOperand;
let MIOperandInfo = (ops i32imm);
}
// postidx_reg := +/- reg
//
def PostIdxRegAsmOperand : AsmOperandClass {
let Name = "PostIdxReg";
let ParserMethod = "parsePostIdxReg";
}
def postidx_reg : MemOperand {
let EncoderMethod = "getPostIdxRegOpValue";
let DecoderMethod = "DecodePostIdxReg";
let PrintMethod = "printPostIdxRegOperand";
let ParserMatchClass = PostIdxRegAsmOperand;
let MIOperandInfo = (ops GPRnopc, i32imm);
}
// addrmode2 := reg +/- imm12
// := reg +/- reg shop imm
//
// FIXME: addrmode2 should be refactored the rest of the way to always
// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
def addrmode2 : MemOperand,
ComplexPattern<i32, 3, "SelectAddrMode2", []> {
let EncoderMethod = "getAddrMode2OpValue";
let PrintMethod = "printAddrMode2Operand";
let ParserMatchClass = AddrMode2AsmOperand;
let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
}
def PostIdxRegShiftedAsmOperand : AsmOperandClass {
let Name = "PostIdxRegShifted";
let ParserMethod = "parsePostIdxReg";
}
def am2offset_reg : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
[], [SDNPWantRoot]> {
let EncoderMethod = "getAddrMode2OffsetOpValue";
let PrintMethod = "printAddrMode2OffsetOperand";
// When using this for assembly, it's always as a post-index offset.
let ParserMatchClass = PostIdxRegShiftedAsmOperand;
let MIOperandInfo = (ops GPRnopc, i32imm);
}
// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
// the GPR is purely vestigial at this point.
def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
def am2offset_imm : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
[], [SDNPWantRoot]> {
let EncoderMethod = "getAddrMode2OffsetOpValue";
let PrintMethod = "printAddrMode2OffsetOperand";
let ParserMatchClass = AM2OffsetImmAsmOperand;
let MIOperandInfo = (ops GPRnopc, i32imm);
}
// addrmode3 := reg +/- reg
// addrmode3 := reg +/- imm8
//
// FIXME: split into imm vs. reg versions.
def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
class AddrMode3 : MemOperand,
ComplexPattern<i32, 3, "SelectAddrMode3", []> {
let EncoderMethod = "getAddrMode3OpValue";
let ParserMatchClass = AddrMode3AsmOperand;
let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
}
def addrmode3 : AddrMode3
{
let PrintMethod = "printAddrMode3Operand<false>";
}
def addrmode3_pre : AddrMode3
{
let PrintMethod = "printAddrMode3Operand<true>";
}
// FIXME: split into imm vs. reg versions.
// FIXME: parser method to handle +/- register.
def AM3OffsetAsmOperand : AsmOperandClass {
let Name = "AM3Offset";
let ParserMethod = "parseAM3Offset";
}
def am3offset : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode3Offset",
[], [SDNPWantRoot]> {
let EncoderMethod = "getAddrMode3OffsetOpValue";
let PrintMethod = "printAddrMode3OffsetOperand";
let ParserMatchClass = AM3OffsetAsmOperand;
let MIOperandInfo = (ops GPR, i32imm);
}
// ldstm_mode := {ia, ib, da, db}
//
def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
let EncoderMethod = "getLdStmModeOpValue";
let PrintMethod = "printLdStmModeOperand";
}
// addrmode5 := reg +/- imm8*4
//
def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
class AddrMode5 : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode5", []> {
let EncoderMethod = "getAddrMode5OpValue";
let DecoderMethod = "DecodeAddrMode5Operand";
let ParserMatchClass = AddrMode5AsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm);
}
def addrmode5 : AddrMode5 {
let PrintMethod = "printAddrMode5Operand<false>";
}
def addrmode5_pre : AddrMode5 {
let PrintMethod = "printAddrMode5Operand<true>";
}
// addrmode5fp16 := reg +/- imm8*2
//
def AddrMode5FP16AsmOperand : AsmOperandClass { let Name = "AddrMode5FP16"; }
class AddrMode5FP16 : Operand<i32>,
ComplexPattern<i32, 2, "SelectAddrMode5FP16", []> {
let EncoderMethod = "getAddrMode5FP16OpValue";
let DecoderMethod = "DecodeAddrMode5FP16Operand";
let ParserMatchClass = AddrMode5FP16AsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm);
}
def addrmode5fp16 : AddrMode5FP16 {
let PrintMethod = "printAddrMode5FP16Operand<false>";
}
// addrmode6 := reg with optional alignment
//
def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
def addrmode6 : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
let EncoderMethod = "getAddrMode6AddressOpValue";
let DecoderMethod = "DecodeAddrMode6Operand";
let ParserMatchClass = AddrMode6AsmOperand;
}
def am6offset : MemOperand,
ComplexPattern<i32, 1, "SelectAddrMode6Offset",
[], [SDNPWantRoot]> {
let PrintMethod = "printAddrMode6OffsetOperand";
let MIOperandInfo = (ops GPR);
let EncoderMethod = "getAddrMode6OffsetOpValue";
let DecoderMethod = "DecodeGPRRegisterClass";
}
// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
// (single element from one lane) for size 32.
def addrmode6oneL32 : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
let MIOperandInfo = (ops GPR:$addr, i32imm);
let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
}
// Base class for addrmode6 with specific alignment restrictions.
class AddrMode6Align : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
let EncoderMethod = "getAddrMode6AddressOpValue";
let DecoderMethod = "DecodeAddrMode6Operand";
}
// Special version of addrmode6 for VLD/VST instructions that allow no
// alignment encoding, checking that the alignment is not specified.
def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
let Name = "AlignedMemoryNone";
let DiagnosticType = "AlignedMemoryRequiresNone";
}
def addrmode6alignNone : AddrMode6Align {
// The alignment specifier can only be omitted.
let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
}
// Special version of addrmode6 to handle 16-bit alignment encoding for
// VLD/VST instructions and checking the alignment value.
def AddrMode6Align16AsmOperand : AsmOperandClass {
let Name = "AlignedMemory16";
let DiagnosticType = "AlignedMemoryRequires16";
}
def addrmode6align16 : AddrMode6Align {
// The alignment specifier can only be 16 or omitted.
let ParserMatchClass = AddrMode6Align16AsmOperand;
}
// Special version of addrmode6 to handle 32-bit alignment encoding for
// VLD/VST instructions and checking the alignment value.
def AddrMode6Align32AsmOperand : AsmOperandClass {
let Name = "AlignedMemory32";
let DiagnosticType = "AlignedMemoryRequires32";
}
def addrmode6align32 : AddrMode6Align {
// The alignment specifier can only be 32 or omitted.
let ParserMatchClass = AddrMode6Align32AsmOperand;
}
// Special version of addrmode6 to handle 64-bit alignment encoding for
// VLD/VST instructions and checking the alignment value.
def AddrMode6Align64AsmOperand : AsmOperandClass {
let Name = "AlignedMemory64";
let DiagnosticType = "AlignedMemoryRequires64";
}
def addrmode6align64 : AddrMode6Align {
// The alignment specifier can only be 64 or omitted.
let ParserMatchClass = AddrMode6Align64AsmOperand;
}
// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
// for VLD/VST instructions and checking the alignment value.
def AddrMode6Align64or128AsmOperand : AsmOperandClass {
let Name = "AlignedMemory64or128";
let DiagnosticType = "AlignedMemoryRequires64or128";
}
def addrmode6align64or128 : AddrMode6Align {
// The alignment specifier can only be 64, 128 or omitted.
let ParserMatchClass = AddrMode6Align64or128AsmOperand;
}
// Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment
// encoding for VLD/VST instructions and checking the alignment value.
def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
let Name = "AlignedMemory64or128or256";
let DiagnosticType = "AlignedMemoryRequires64or128or256";
}
def addrmode6align64or128or256 : AddrMode6Align {
// The alignment specifier can only be 64, 128, 256 or omitted.
let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
}
// Special version of addrmode6 to handle alignment encoding for VLD-dup
// instructions, specifically VLD4-dup.
def addrmode6dup : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
let MIOperandInfo = (ops GPR:$addr, i32imm);
let EncoderMethod = "getAddrMode6DupAddressOpValue";
// FIXME: This is close, but not quite right. The alignment specifier is
// different.
let ParserMatchClass = AddrMode6AsmOperand;
}
// Base class for addrmode6dup with specific alignment restrictions.
class AddrMode6DupAlign : MemOperand,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
let MIOperandInfo = (ops GPR:$addr, i32imm);
let EncoderMethod = "getAddrMode6DupAddressOpValue";
}
// Special version of addrmode6 for VLD-dup instructions that allow no
// alignment encoding, checking that the alignment is not specified.
def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
let Name = "DupAlignedMemoryNone";
let DiagnosticType = "DupAlignedMemoryRequiresNone";
}
def addrmode6dupalignNone : AddrMode6DupAlign {
// The alignment specifier can only be omitted.
let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
}
// Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup
// instruction and checking the alignment value.
def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
let Name = "DupAlignedMemory16";
let DiagnosticType = "DupAlignedMemoryRequires16";
}
def addrmode6dupalign16 : AddrMode6DupAlign {
// The alignment specifier can only be 16 or omitted.
let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
}
// Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup
// instruction and checking the alignment value.
def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
let Name = "DupAlignedMemory32";
let DiagnosticType = "DupAlignedMemoryRequires32";
}
def addrmode6dupalign32 : AddrMode6DupAlign {
// The alignment specifier can only be 32 or omitted.
let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
}
// Special version of addrmode6 to handle 64-bit alignment encoding for VLD
// instructions and checking the alignment value.
def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
let Name = "DupAlignedMemory64";
let DiagnosticType = "DupAlignedMemoryRequires64";
}
def addrmode6dupalign64 : AddrMode6DupAlign {
// The alignment specifier can only be 64 or omitted.
let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
}
// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
// for VLD instructions and checking the alignment value.
def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
let Name = "DupAlignedMemory64or128";
let DiagnosticType = "DupAlignedMemoryRequires64or128";
}
def addrmode6dupalign64or128 : AddrMode6DupAlign {
// The alignment specifier can only be 64, 128 or omitted.
let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand;
}
// addrmodepc := pc + reg
//
def addrmodepc : MemOperand,
ComplexPattern<i32, 2, "SelectAddrModePC", []> {
let PrintMethod = "printAddrModePCOperand";
let MIOperandInfo = (ops GPR, i32imm);
}
// addr_offset_none := reg
//
def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
def addr_offset_none : MemOperand,
ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
let PrintMethod = "printAddrMode7Operand";
let DecoderMethod = "DecodeAddrMode7Operand";
let ParserMatchClass = MemNoOffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base);
}
def nohash_imm : Operand<i32> {
let PrintMethod = "printNoHashImmediate";
}
def CoprocNumAsmOperand : AsmOperandClass {
let Name = "CoprocNum";
let ParserMethod = "parseCoprocNumOperand";
}
def p_imm : Operand<i32> {
let PrintMethod = "printPImmediate";
let ParserMatchClass = CoprocNumAsmOperand;
let DecoderMethod = "DecodeCoprocessor";
}
def CoprocRegAsmOperand : AsmOperandClass {
let Name = "CoprocReg";
let ParserMethod = "parseCoprocRegOperand";
}
def c_imm : Operand<i32> {
let PrintMethod = "printCImmediate";
let ParserMatchClass = CoprocRegAsmOperand;
}
def CoprocOptionAsmOperand : AsmOperandClass {
let Name = "CoprocOption";
let ParserMethod = "parseCoprocOptionOperand";
}
def coproc_option_imm : Operand<i32> {
let PrintMethod = "printCoprocOptionImm";
let ParserMatchClass = CoprocOptionAsmOperand;
}
//===----------------------------------------------------------------------===//
include "ARMInstrFormats.td"
//===----------------------------------------------------------------------===//
// Multiclass helpers...
//
/// AsI1_bin_irs - Defines a set of (op r, {mod_imm|r|so_reg}) patterns for a
/// binop that produces a value.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_bin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
SDPatternOperator opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
iii, opc, "\t$Rd, $Rn, $imm",
[(set GPR:$Rd, (opnode GPR:$Rn, mod_imm:$imm))]>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-0} = imm;
}
}
def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
iir, opc, "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{25} = 0;
let isCommutable = Commutable;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rm;
}
def rsi : AsI1<opcod, (outs GPR:$Rd),
(ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
iis, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
Sched<[WriteALUsi, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
}
def rsr : AsI1<opcod, (outs GPR:$Rd),
(ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
iis, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
Sched<[WriteALUsr, ReadALUsr]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
}
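// Sketch of a typical instantiation (the actual defm lines, with their exact
// itineraries, live further down in this file); one defm produces the
// ri/rr/rsi/rsr variants in one go:
//   defm AND : AsI1_bin_irs<0b0000, "and",
//                           IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
// yielding ANDri, ANDrr, ANDrsi and ANDrsr.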
/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands is
/// reversed. The 'rr' form is only defined for the disassembler; for codegen
/// it is equivalent to the AsI1_bin_irs counterpart.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
SDNode opnode, bit Commutable = 0> {
// The register-immediate version is re-materializable. This is useful
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm), DPFrm,
iii, opc, "\t$Rd, $Rn, $imm",
[(set GPR:$Rd, (opnode mod_imm:$imm, GPR:$Rn))]>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-0} = imm;
}
}
def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
iir, opc, "\t$Rd, $Rn, $Rm",
[/* pattern left blank */]>,
Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
}
def rsi : AsI1<opcod, (outs GPR:$Rd),
(ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
iis, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>,
Sched<[WriteALUsi, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
}
def rsr : AsI1<opcod, (outs GPR:$Rd),
(ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
iis, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>,
Sched<[WriteALUsr, ReadALUsr]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
}
/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
///
/// These opcodes will be converted to the real non-S opcodes by
/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm))]>,
Sched<[WriteALU, ReadALU]>;
def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
4, iir,
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>,
Sched<[WriteALU, ReadALU, ReadALU]> {
let isCommutable = Commutable;
}
def rsi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
4, iis,
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
so_reg_imm:$shift))]>,
Sched<[WriteALUsi, ReadALU]>;
def rsr : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
4, iis,
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
so_reg_reg:$shift))]>,
Sched<[WriteALUSsr, ReadALUsr]>;
}
}
/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
/// operands are reversed.
let hasPostISelHook = 1, Defs = [CPSR] in {
multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
InstrItinClass iis, SDNode opnode,
bit Commutable = 0> {
def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm, pred:$p),
4, iii,
[(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn))]>,
Sched<[WriteALU, ReadALU]>;
def rsi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
4, iis,
[(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
GPR:$Rn))]>,
Sched<[WriteALUsi, ReadALU]>;
def rsr : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
4, iis,
[(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
GPR:$Rn))]>,
Sched<[WriteALUSsr, ReadALUsr]>;
}
}
/// AI1_cmp_irs - Defines a set of (op r, {mod_imm|r|so_reg}) cmp / test
/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
/// an explicit result; it only implicitly sets CPSR.
let isCompare = 1, Defs = [CPSR] in {
multiclass AI1_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
SDPatternOperator opnode, bit Commutable = 0,
string rrDecoderMethod = ""> {
def ri : AI1<opcod, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, iii,
opc, "\t$Rn, $imm",
[(opnode GPR:$Rn, mod_imm:$imm)]>,
Sched<[WriteCMP, ReadALU]> {
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-0} = imm;
let Unpredictable{15-12} = 0b1111;
}
def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
opc, "\t$Rn, $Rm",
[(opnode GPR:$Rn, GPR:$Rm)]>,
Sched<[WriteCMP, ReadALU, ReadALU]> {
bits<4> Rn;
bits<4> Rm;
let isCommutable = Commutable;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rm;
let DecoderMethod = rrDecoderMethod;
let Unpredictable{15-12} = 0b1111;
}
def rsi : AI1<opcod, (outs),
(ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
opc, "\t$Rn, $shift",
[(opnode GPR:$Rn, so_reg_imm:$shift)]>,
Sched<[WriteCMPsi, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
let Unpredictable{15-12} = 0b1111;
}
def rsr : AI1<opcod, (outs),
(ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
opc, "\t$Rn, $shift",
[(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
Sched<[WriteCMPsr, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
let Unpredictable{15-12} = 0b1111;
}
}
}
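// Sketch of a typical instantiation (the actual defm, with its exact
// itineraries and SDNode, is further down in this file):
//   defm CMP : AI1_cmp_irs<0b1010, "cmp",
//                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi, ARMcmp>;
// i.e. cmp writes no GPR and only updates CPSR via the Defs list above.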
/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
[(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
bits<4> Rd;
bits<4> Rm;
bits<2> rot;
let Inst{19-16} = 0b1111;
let Inst{15-12} = Rd;
let Inst{11-10} = rot;
let Inst{3-0} = Rm;
}
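// Sketch of how this class is used (assuming the usual sxtb definition with
// this encoding; the actual def is further down in this file):
//   def SXTB : AI_ext_rrot<0b01101010, "sxtb",
//                          UnOpFrag<(sext_inreg node:$Src, i8)>>;
// so "sxtb r0, r1, ror #16" sign-extends byte 2 of r1 into r0.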
class AI_ext_rrot_np<bits<8> opcod, string opc>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
bits<2> rot;
let Inst{19-16} = 0b1111;
let Inst{11-10} = rot;
}
/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
[(set GPRnopc:$Rd, (opnode GPR:$Rn,
(rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
bits<2> rot;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-10} = rot;
let Inst{9-4} = 0b000111;
let Inst{3-0} = Rm;
}
class AI_exta_rrot_np<bits<8> opcod, string opc>
: AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
bits<4> Rn;
bits<2> rot;
let Inst{19-16} = Rn;
let Inst{11-10} = rot;
}
/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, SDNode opnode,
bit Commutable = 0> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn, mod_imm:$imm, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
let Inst{11-0} = imm;
}
def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
let isCommutable = Commutable;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
}
def rsi : AsI1<opcod, (outs GPR:$Rd),
(ins GPR:$Rn, so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALUsi, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
}
def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
[(set GPRnopc:$Rd, CPSR,
(opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALUsr, ReadALUsr]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
}
}
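// Sketch of typical instantiations (the actual defm lines are further down
// in this file):
//   defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
//   defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
// Both read and write CPSR, which is why the multiclass sets Uses and Defs.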
/// AI1_rsc_irs - Define instructions and patterns for rsc.
let TwoOperandAliasConstraint = "$Rn = $Rd" in
multiclass AI1_rsc_irs<bits<4> opcod, string opc, SDNode opnode> {
let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, mod_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
[(set GPR:$Rd, CPSR, (opnode mod_imm:$imm, GPR:$Rn, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
let Inst{11-0} = imm;
}
def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
[/* pattern left blank */]>,
Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
}
def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALUsi, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
}
def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
[(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
Requires<[IsARM]>,
Sched<[WriteALUsr, ReadALUsr]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
}
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
InstrItinClass iir, PatFrag opnode> {
// Note: We use the complex addrmode_imm12 rather than just an input
// GPR and a constrained immediate so that we can use this to match
// frame index references and avoid matching constant pool references.
def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
[(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
[(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
let Inst{23} = shift{12}; // U (add = ('U' == 1))
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = shift{11-0};
}
}
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
InstrItinClass iir, PatFrag opnode> {
// Note: We use the complex addrmode_imm12 rather than just an input
// GPR and a constrained immediate so that we can use this to match
// frame index references and avoid matching constant pool references.
def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
(ins addrmode_imm12:$addr),
AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
[(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
(ins ldst_so_reg:$shift),
AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
[(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
let Inst{23} = shift{12}; // U (add = ('U' == 1))
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = shift{11-0};
}
}
}
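// Sketch of typical instantiations (the actual defm lines, with their exact
// itineraries, are further down in this file):
//   defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
//   defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
//                           zextloadi8>;
// each giving an i12 (immediate offset) and an rs (register-shifted) form.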
multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
InstrItinClass iir, PatFrag opnode> {
// Note: We use the complex addrmode_imm12 rather than just an input
// GPR and a constrained immediate so that we can use this to match
// frame index references and avoid matching constant pool references.
def i12 : AI2ldst<0b010, 0, isByte, (outs),
(ins GPR:$Rt, addrmode_imm12:$addr),
AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
[(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
[(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
let Inst{23} = shift{12}; // U (add = ('U' == 1))
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = shift{11-0};
}
}
multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
InstrItinClass iir, PatFrag opnode> {
// Note: We use the complex addrmode_imm12 rather than just an input
// GPR and a constrained immediate so that we can use this to match
// frame index references and avoid matching constant pool references.
def i12 : AI2ldst<0b010, 0, isByte, (outs),
(ins GPRnopc:$Rt, addrmode_imm12:$addr),
AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
[(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
def rs : AI2ldst<0b011, 0, isByte, (outs),
(ins GPRnopc:$Rt, ldst_so_reg:$shift),
AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
[(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
bits<4> Rt;
bits<17> shift;
let shift{4} = 0; // Inst{4} = 0
let Inst{23} = shift{12}; // U (add = ('U' == 1))
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = Rt;
let Inst{11-0} = shift{11-0};
}
}
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions.
//
/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
/// the function. The first operand is the ID# for this instruction, the second
/// is the index into the MachineConstantPool for this entry, and the third is
/// the size in bytes of this constant pool entry.
let hasSideEffects = 0, isNotDuplicable = 1 in
def CONSTPOOL_ENTRY :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
/// A jumptable consisting of direct 32-bit addresses of the destination basic
/// blocks (either absolute, or relative to the start of the jump-table in PIC
/// mode). Used mostly in ARM and Thumb-1 modes.
def JUMPTABLE_ADDRS :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
/// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables
/// that cannot be optimised to use TBB or TBH.
def JUMPTABLE_INSTS :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
/// A jumptable consisting of 8-bit unsigned integers representing offsets from
/// a TBB instruction.
def JUMPTABLE_TBB :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
/// A jumptable consisting of 16-bit unsigned integers representing offsets from
/// a TBH instruction.
def JUMPTABLE_TBH :
PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
i32imm:$size), NoItinerary, []>;
// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
// from removing one half of the matched pairs. That breaks PEI, which assumes
// these will always be in pairs, and asserts if it finds otherwise. Better way?
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
def ADJCALLSTACKUP :
PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_start timm:$amt, timm:$amt2)]>;
}
def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
"hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
Requires<[IsARM, HasV6]> {
bits<8> imm;
let Inst{27-8} = 0b00110010000011110000;
let Inst{7-0} = imm;
let DecoderMethod = "DecodeHINTInstruction";
}
def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6K]>;
def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (int_arm_sel GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
let Inst{27-20} = 0b01101000;
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
let Unpredictable{11-8} = 0b1111;
}
// The 16-bit operand $val can be used by a debugger to store more information
// about the breakpoint.
def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"bkpt", "\t$val", []>, Requires<[IsARM]> {
bits<16> val;
let Inst{3-0} = val{3-0};
let Inst{19-8} = val{15-4};
let Inst{27-20} = 0b00010010;
let Inst{31-28} = 0xe; // AL
let Inst{7-4} = 0b0111;
}
// default immediate for breakpoint mnemonic
def : InstAlias<"bkpt", (BKPT 0), 0>, Requires<[IsARM]>;
def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
"hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
bits<16> val;
let Inst{3-0} = val{3-0};
let Inst{19-8} = val{15-4};
let Inst{27-20} = 0b00010000;
let Inst{31-28} = 0xe; // AL
let Inst{7-4} = 0b0111;
}
// Change Processor State
// FIXME: We should use InstAlias to handle the optional operands.
class CPS<dag iops, string asm_ops>
: AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
[]>, Requires<[IsARM]> {
bits<2> imod;
bits<3> iflags;
bits<5> mode;
bit M;
let Inst{31-28} = 0b1111;
let Inst{27-20} = 0b00010000;
let Inst{19-18} = imod;
let Inst{17} = M; // Enabled if mode is set;
let Inst{16-9} = 0b00000000;
let Inst{8-6} = iflags;
let Inst{5} = 0;
let Inst{4-0} = mode;
}
let DecoderMethod = "DecodeCPSInstruction" in {
let M = 1 in
def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
"$imod\t$iflags, $mode">;
let mode = 0, M = 0 in
def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
let imod = 0, iflags = 0, M = 1 in
def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
}
// Preload hints the memory system about possible future data/instruction accesses.
multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
def i12 : AXIM<(outs), (ins addrmode_imm12:$addr), AddrMode_i12, MiscFrm,
IIC_Preload, !strconcat(opc, "\t$addr"),
[(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
Sched<[WritePreLd]> {
bits<4> Rt;
bits<17> addr;
let Inst{31-26} = 0b111101;
let Inst{25} = 0; // 0 for immediate form
let Inst{24} = data;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{22} = read;
let Inst{21-20} = 0b01;
let Inst{19-16} = addr{16-13}; // Rn
let Inst{15-12} = 0b1111;
let Inst{11-0} = addr{11-0}; // imm12
}
def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
!strconcat(opc, "\t$shift"),
[(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
Sched<[WritePreLd]> {
bits<17> shift;
let Inst{31-26} = 0b111101;
let Inst{25} = 1; // 1 for register form
let Inst{24} = data;
let Inst{23} = shift{12}; // U (add = ('U' == 1))
let Inst{22} = read;
let Inst{21-20} = 0b01;
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = 0b1111;
let Inst{11-0} = shift{11-0};
let Inst{4} = 0;
}
}
defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>;
defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>;
def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
"setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
bits<1> end;
let Inst{31-10} = 0b1111000100000001000000;
let Inst{9} = end;
let Inst{8-0} = 0;
}
def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
[(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> {
bits<4> opt;
let Inst{27-4} = 0b001100100000111100001111;
let Inst{3-0} = opt;
}
// A8.8.247 UDF - Undefined (Encoding A1)
def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
"udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> {
bits<16> imm16;
let Inst{31-28} = 0b1110; // AL
let Inst{27-25} = 0b011;
let Inst{24-20} = 0b11111;
let Inst{19-8} = imm16{15-4};
let Inst{7-4} = 0b1111;
let Inst{3-0} = imm16{3-0};
}
/*
* A5.4 Permanently UNDEFINED instructions.
*
* For most targets use UDF #65006, for which the OS will generate SIGTRAP.
* Other UDF encodings generate SIGILL.
*
* NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
* Encoding A1:
* 1110 0111 1111 iiii iiii iiii 1111 iiii
* Encoding T1:
* 1101 1110 iiii iiii
* It uses the following encoding:
* 1110 0111 1111 1110 1101 1110 1111 0000
* - In ARM: UDF #60896;
* - In Thumb: UDF #254 followed by a branch-to-self.
*/
let isBarrier = 1, isTerminator = 1 in
def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
"trap", [(trap)]>,
Requires<[IsARM,UseNaClTrap]> {
let Inst = 0xe7fedef0;
}
let isBarrier = 1, isTerminator = 1 in
def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
"trap", [(trap)]>,
Requires<[IsARM,DontUseNaClTrap]> {
let Inst = 0xe7ffdefe;
}
// Address computation and loads and stores in PIC mode.
let isNotDuplicable = 1 in {
def PICADD : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
4, IIC_iALUr,
[(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
Sched<[WriteALU, ReadALU]>;
let AddedComplexity = 10 in {
def PICLDR : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
4, IIC_iLoad_r,
[(set GPR:$dst, (load addrmodepc:$addr))]>;
def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
4, IIC_iLoad_bh_r,
[(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
4, IIC_iLoad_bh_r,
[(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
4, IIC_iLoad_bh_r,
[(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
4, IIC_iLoad_bh_r,
[(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
}
let AddedComplexity = 10 in {
def PICSTR : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
addrmodepc:$addr)]>;
def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
}
} // isNotDuplicable = 1
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
let hasSideEffects = 0, isReMaterializable = 1 in
// The 'adr' mnemonic encodes differently if the label is before or after
// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
// know until then which form of the instruction will be used.
def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>,
Sched<[WriteALU, ReadALU]> {
bits<4> Rd;
bits<14> label;
let Inst{27-25} = 0b001;
let Inst{24} = 0;
let Inst{23-22} = label{13-12};
let Inst{21} = 0;
let Inst{20} = 0;
let Inst{19-16} = 0b1111;
let Inst{15-12} = Rd;
let Inst{11-0} = label{11-0};
}
let hasSideEffects = 1 in {
def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
(ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
}
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
// ARMV4T and above
def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
"bx", "\tlr", [(ARMretflag)]>,
Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
let Inst{27-0} = 0b0001001011111111111100011110;
}
// ARMV4 only
def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
"mov", "\tpc, lr", [(ARMretflag)]>,
Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
let Inst{27-0} = 0b0001101000001111000000001110;
}
// Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
// the user-space one).
def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
4, IIC_Br,
[(ARMintretflag imm:$offset)]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
// ARMV4T and above
def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
[(brind GPR:$dst)]>,
Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
bits<4> dst;
let Inst{31-4} = 0b1110000100101111111111110001;
let Inst{3-0} = dst;
}
def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
"bx", "\t$dst", [/* pattern left blank */]>,
Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
bits<4> dst;
let Inst{27-4} = 0b000100101111111111110001;
let Inst{3-0} = dst;
}
}
// SP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead.
let isCall = 1,
// FIXME: Do we really need a non-predicated version? If so, it should
// at least be a pseudo instruction expanding to the predicated version
// at MC lowering time.
Defs = [LR], Uses = [SP] in {
def BL : ABXI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
let Inst{31-28} = 0b1110;
bits<24> func;
let Inst{23-0} = func;
let DecoderMethod = "DecodeBranchImmInstruction";
}
def BL_pred : ABI<0b1011, (outs), (ins arm_bl_target:$func),
IIC_Br, "bl", "\t$func",
[(ARMcall_pred tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBrL]> {
bits<24> func;
let Inst{23-0} = func;
let DecoderMethod = "DecodeBranchImmInstruction";
}
// ARMv5T and above
def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx\t$func",
[(ARMcall GPR:$func)]>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{31-4} = 0b1110000100101111111111110011;
let Inst{3-0} = func;
}
def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
IIC_Br, "blx", "\t$func",
[(ARMcall_pred GPR:$func)]>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
bits<4> func;
let Inst{27-4} = 0b000100101111111111110011;
let Inst{3-0} = func;
}
// ARMv4T
// Note: Restrict $func to the tGPR regclass to prevent it being in LR.
def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
// ARMv4
def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
// mov lr, pc; b <target>, used when the callee is marked noreturn, to avoid
// confusing the return stack predictor.
def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins arm_bl_target:$func),
8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
Requires<[IsARM]>, Sched<[WriteBr]>;
}
let isBranch = 1, isTerminator = 1 in {
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
// a two-value operand where a dag node expects two operands. :(
def Bcc : ABI<0b1010, (outs), (ins arm_br_target:$target),
IIC_Br, "b", "\t$target",
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
Sched<[WriteBr]> {
bits<24> target;
let Inst{23-0} = target;
let DecoderMethod = "DecodeBranchImmInstruction";
}
let isBarrier = 1 in {
// B is "predicable" since it's just a Bcc with an 'always' condition.
let isPredicable = 1 in
// FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
// should be sufficient.
// FIXME: Is B really a Barrier? That doesn't seem right.
def B : ARMPseudoExpand<(outs), (ins arm_br_target:$target), 4, IIC_Br,
[(br bb:$target)], (Bcc arm_br_target:$target,
(ops 14, zero_reg))>,
Sched<[WriteBr]>;
let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
def BR_JTr : ARMPseudoInst<(outs),
(ins GPR:$target, i32imm:$jt),
0, IIC_Br,
[(ARMbrjt GPR:$target, tjumptable:$jt)]>,
Sched<[WriteBr]>;
// FIXME: This shouldn't use the generic "addrmode2," but rather be split
// into i12 and rs suffixed versions.
def BR_JTm : ARMPseudoInst<(outs),
(ins addrmode2:$target, i32imm:$jt),
0, IIC_Br,
[(ARMbrjt (i32 (load addrmode2:$target)),
tjumptable:$jt)]>, Sched<[WriteBrTbl]>;
def BR_JTadd : ARMPseudoInst<(outs),
(ins GPR:$target, GPR:$idx, i32imm:$jt),
0, IIC_Br,
[(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt)]>,
Sched<[WriteBrTbl]>;
} // isNotDuplicable = 1, isIndirectBranch = 1
} // isBarrier = 1
}
// BLX (immediate)
def BLXi : AXI<(outs), (ins arm_blx_target:$target), BrMiscFrm, NoItinerary,
"blx\t$target", []>,
Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
let Inst{31-25} = 0b1111101;
bits<25> target;
let Inst{23-0} = target{24-1};
let Inst{24} = target{0};
let isCall = 1;
}
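// Note that the low bit of the 25-bit target lands in bit 24 (the H bit), so
// the branch offset is effectively target:'0', which is what lets BLX reach a
// halfword-aligned Thumb destination.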
// Branch and Exchange Jazelle
def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
[/* pattern left blank */]>, Sched<[WriteBr]> {
bits<4> func;
let Inst{23-20} = 0b0010;
let Inst{19-8} = 0xfff;
let Inst{7-4} = 0b0010;
let Inst{3-0} = func;
let isBranch = 1;
}
// Tail calls.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
Sched<[WriteBr]>;
def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
Sched<[WriteBr]>;
def TAILJMPd : ARMPseudoExpand<(outs), (ins arm_br_target:$dst),
4, IIC_Br, [],
(Bcc arm_br_target:$dst, (ops 14, zero_reg))>,
Requires<[IsARM]>, Sched<[WriteBr]>;
def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
4, IIC_Br, [],
(BX GPR:$dst)>, Sched<[WriteBr]>,
Requires<[IsARM]>;
}
// Secure Monitor Call is a system instruction.
def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
[]>, Requires<[IsARM, HasTrustZone]> {
bits<4> opt;
let Inst{23-4} = 0b01100000000000000111;
let Inst{3-0} = opt;
}
def : MnemonicAlias<"smi", "smc">;
// Supervisor Call (Software Interrupt)
let isCall = 1, Uses = [SP] in {
def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
Sched<[WriteBr]> {
bits<24> svc;
let Inst{23-0} = svc;
}
}
// Store Return State
class SRSI<bit wb, string asm>
: XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
NoItinerary, asm, "", []> {
bits<5> mode;
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b100;
let Inst{22} = 1;
let Inst{21} = wb;
let Inst{20} = 0;
let Inst{19-16} = 0b1101; // SP
let Inst{15-5} = 0b00000101000;
let Inst{4-0} = mode;
}
def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
let Inst{24-23} = 0;
}
def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
let Inst{24-23} = 0;
}
def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
let Inst{24-23} = 0b10;
}
def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
let Inst{24-23} = 0b10;
}
def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
let Inst{24-23} = 0b01;
}
def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
let Inst{24-23} = 0b01;
}
def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
let Inst{24-23} = 0b11;
}
def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
let Inst{24-23} = 0b11;
}
def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
// Return From Exception
class RFEI<bit wb, string asm>
: XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
NoItinerary, asm, "", []> {
bits<4> Rn;
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b100;
let Inst{22} = 0;
let Inst{21} = wb;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-0} = 0xa00;
}
def RFEDA : RFEI<0, "rfeda\t$Rn"> {
let Inst{24-23} = 0;
}
def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
let Inst{24-23} = 0;
}
def RFEDB : RFEI<0, "rfedb\t$Rn"> {
let Inst{24-23} = 0b10;
}
def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
let Inst{24-23} = 0b10;
}
def RFEIA : RFEI<0, "rfeia\t$Rn"> {
let Inst{24-23} = 0b01;
}
def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
let Inst{24-23} = 0b01;
}
def RFEIB : RFEI<0, "rfeib\t$Rn"> {
let Inst{24-23} = 0b11;
}
def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
let Inst{24-23} = 0b11;
}
// Hypervisor Call is a system instruction
let isCall = 1 in {
def HVC : AInoP< (outs), (ins imm0_65535:$imm), BrFrm, NoItinerary,
"hvc", "\t$imm", []>,
Requires<[IsARM, HasVirtualization]> {
bits<16> imm;
// Even though HVC isn't predicable, it's encoding includes a condition field.
// The instruction is undefined if the condition field is 0xf otherwise it is
// unpredictable if it isn't condition AL (0xe).
let Inst{31-28} = 0b1110;
let Unpredictable{31-28} = 0b1111;
let Inst{27-24} = 0b0001;
let Inst{23-20} = 0b0100;
let Inst{19-8} = imm{15-4};
let Inst{7-4} = 0b0111;
let Inst{3-0} = imm{3-0};
}
}
// Return from exception in Hypervisor mode.
let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
def ERET : ABI<0b0001, (outs), (ins), NoItinerary, "eret", "", []>,
Requires<[IsARM, HasVirtualization]> {
let Inst{23-0} = 0b011000000000000001101110;
}
//===----------------------------------------------------------------------===//
// Load / Store Instructions.
//
// Load
defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si, load>;
defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
zextloadi8>;
defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si, store>;
defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
truncstorei8>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, hasSideEffects = 0,
isReMaterializable = 1, isCodeGenOnly = 1 in
def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
[]> {
bits<4> Rt;
bits<17> addr;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = 0b1111;
let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm12
}
// Loads with zero extension
def LDRH : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
[(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
// Loads with sign extension
def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
[(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
[(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
Requires<[IsARM, HasV5TE]>;
}
def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "lda", "\t$Rt, $addr", []>;
def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldab", "\t$Rt, $addr", []>;
def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldah", "\t$Rt, $addr", []>;
// Indexed loads
multiclass AI2_ldridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addrmode_imm12_pre:$addr), IndexModePre, LdFrm, iii,
opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
bits<17> addr;
let Inst{25} = 0;
let Inst{23} = addr{12};
let Inst{19-16} = addr{16-13};
let Inst{11-0} = addr{11-0};
let DecoderMethod = "DecodeLDRPreImm";
}
def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
bits<17> addr;
let Inst{25} = 1;
let Inst{23} = addr{12};
let Inst{19-16} = addr{16-13};
let Inst{11-0} = addr{11-0};
let Inst{4} = 0;
let DecoderMethod = "DecodeLDRPreReg";
}
def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, LdFrm, iir,
opc, "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let Inst{4} = 0;
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, LdFrm, iii,
opc, "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
}
let mayLoad = 1, hasSideEffects = 0 in {
// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
// IIC_iLoad_siu depending on whether the offset register is shifted.
defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
}
multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addrmode3_pre:$addr), IndexModePre,
LdMiscFrm, itin,
opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
bits<14> addr;
let Inst{23} = addr{8}; // U bit
let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
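// Rough worked example of the packing above (based on the field comments):
// for "ldrh r0, [r1, #6]!" the operand has addr{13} = 1 (imm8 form),
// addr{12-9} = 0b0001 (r1), addr{8} = 1 (add) and addr{7-0} = 0b00000110,
// which the slices scatter into Inst{22}, Inst{19-16}, Inst{23}, Inst{11-8}
// and Inst{3-0}.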
def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am3offset:$offset),
IndexModePost, LdMiscFrm, itin,
opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
[]> {
bits<10> offset;
bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
}
let mayLoad = 1, hasSideEffects = 0 in {
defm LDRH : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
let hasExtraDefRegAllocReq = 1 in {
def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
(ins addrmode3_pre:$addr), IndexModePre,
LdMiscFrm, IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr!",
"$addr.base = $Rn_wb", []> {
bits<14> addr;
let Inst{23} = addr{8}; // U bit
let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am3offset:$offset),
IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr, $offset",
"$addr.base = $Rn_wb", []> {
bits<10> offset;
bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
} // hasExtraDefRegAllocReq = 1
} // mayLoad = 1, hasSideEffects = 0
// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
let mayLoad = 1, hasSideEffects = 0 in {
def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, LdFrm, IIC_iLoad_ru,
"ldrt", "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-5} = offset{11-5};
let Inst{4} = 0;
let Inst{3-0} = offset{3-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def LDRT_POST_IMM
: AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, LdFrm, IIC_iLoad_ru,
"ldrt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, LdFrm, IIC_iLoad_bh_ru,
"ldrbt", "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-5} = offset{11-5};
let Inst{4} = 0;
let Inst{3-0} = offset{3-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def LDRBT_POST_IMM
: AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, LdFrm, IIC_iLoad_bh_ru,
"ldrbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
multiclass AI3ldrT<bits<4> op, string opc> {
def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
(ins addr_offset_none:$addr, postidx_imm8:$offset),
IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
"\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
bits<9> offset;
let Inst{23} = offset{8};
let Inst{22} = 1;
let Inst{11-8} = offset{7-4};
let Inst{3-0} = offset{3-0};
}
def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
(ins addr_offset_none:$addr, postidx_reg:$Rm),
IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
"\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
bits<5> Rm;
let Inst{23} = Rm{4};
let Inst{22} = 0;
let Inst{11-8} = 0;
let Unpredictable{11-8} = 0b1111;
let Inst{3-0} = Rm{3-0};
let DecoderMethod = "DecodeLDR";
}
}
defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
defm LDRHT : AI3ldrT<0b1011, "ldrht">;
defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
}
def LDRT_POST
: ARMAsmPseudo<"ldrt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
(outs GPR:$Rt)>;
def LDRBT_POST
: ARMAsmPseudo<"ldrbt${q} $Rt, $addr", (ins addr_offset_none:$addr, pred:$q),
(outs GPR:$Rt)>;
// Pseudo instruction ldr Rt, =immediate
def LDRConstPool
: ARMAsmPseudo<"ldr${q} $Rt, $immediate",
(ins const_pool_asm_imm:$immediate, pred:$q),
(outs GPR:$Rt)>;
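// A typical use is "ldr r0, =0x12345678": the assembler normally turns this
// into a plain mov when the constant is encodable, and otherwise drops the
// value into a literal pool and rewrites this into a pc-relative load.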
// Store
// Stores with truncate
def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
IIC_iStore_bh_r, "strh", "\t$Rt, $addr",
[(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
// Store doubleword
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>,
Requires<[IsARM, HasV5TE]> {
let Inst{21} = 0;
}
}
// Indexed stores
multiclass AI2_stridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre,
StFrm, iii,
opc, "\t$Rt, $addr!",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
bits<17> addr;
let Inst{25} = 0;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{11-0} = addr{11-0}; // imm12
let DecoderMethod = "DecodeSTRPreImm";
}
def _PRE_REG : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
(ins GPR:$Rt, ldst_so_reg:$addr),
IndexModePre, StFrm, iir,
opc, "\t$Rt, $addr!",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
bits<17> addr;
let Inst{25} = 1;
let Inst{23} = addr{12}; // U (add = ('U' == 1))
let Inst{19-16} = addr{16-13}; // Rn
let Inst{11-0} = addr{11-0};
let Inst{4} = 0;
let DecoderMethod = "DecodeSTRPreReg";
}
def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, StFrm, iir,
opc, "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let Inst{4} = 0;
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, StFrm, iii,
opc, "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
}
let mayStore = 1, hasSideEffects = 0 in {
// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or
// IIC_iStore_siu depending on whether the offset register is shifted.
defm STR : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
}
def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
am2offset_reg:$offset),
(STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
am2offset_reg:$offset)>;
def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
am2offset_imm:$offset),
(STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
am2offset_imm:$offset)>;
def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
am2offset_reg:$offset),
(STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
am2offset_reg:$offset)>;
def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
am2offset_imm:$offset),
(STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
am2offset_imm:$offset)>;
// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
// put the patterns on the instruction definitions directly as ISel wants
// the address base and offset to be separate operands, not the single
// complex operand used to represent the instructions themselves. The
// pseudos map between the two.
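// For instance, a node such as (pre_store r1, r0, #4) selects to STRi_preidx
// with the base and offset still separate; the custom inserter is then
// expected to fold them back into the single addrmode operand of the real
// pre-indexed store (STR_PRE_IMM in that case).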
let usesCustomInserter = 1,
Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPR:$Rn_wb,
(pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPR:$Rn_wb,
(pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPR:$Rn_wb,
(pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPR:$Rn_wb,
(pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPR:$Rn_wb,
(pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
}
def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
StMiscFrm, IIC_iStore_bh_ru,
"strh", "\t$Rt, $addr!",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
bits<14> addr;
let Inst{23} = addr{8}; // U bit
let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
"strh", "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
addr_offset_none:$addr,
am3offset:$offset))]> {
bits<10> offset;
bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rt2, addrmode3_pre:$addr),
IndexModePre, StMiscFrm, IIC_iStore_d_ru,
"strd", "\t$Rt, $Rt2, $addr!",
"$addr.base = $Rn_wb", []> {
bits<14> addr;
let Inst{23} = addr{8}; // U bit
let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
am3offset:$offset),
IndexModePost, StMiscFrm, IIC_iStore_d_ru,
"strd", "\t$Rt, $Rt2, $addr, $offset",
"$addr.base = $Rn_wb", []> {
bits<10> offset;
bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
let DecoderMethod = "DecodeAddrMode3Instruction";
}
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
// STRT, STRBT, and STRHT
def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, StFrm, IIC_iStore_bh_ru,
"strbt", "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-5} = offset{11-5};
let Inst{4} = 0;
let Inst{3-0} = offset{3-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def STRBT_POST_IMM
: AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, StFrm, IIC_iStore_bh_ru,
"strbt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def STRBT_POST
: ARMAsmPseudo<"strbt${q} $Rt, $addr",
(ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
let mayStore = 1, hasSideEffects = 0 in {
def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
IndexModePost, StFrm, IIC_iStore_ru,
"strt", "\t$Rt, $addr, $offset",
"$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 1;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-5} = offset{11-5};
let Inst{4} = 0;
let Inst{3-0} = offset{3-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
def STRT_POST_IMM
: AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
(ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, StFrm, IIC_iStore_ru,
"strt", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> addr;
let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
}
def STRT_POST
: ARMAsmPseudo<"strt${q} $Rt, $addr",
(ins GPR:$Rt, addr_offset_none:$addr, pred:$q)>;
multiclass AI3strT<bits<4> op, string opc> {
def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
(ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
"\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
bits<9> offset;
let Inst{23} = offset{8};
let Inst{22} = 1;
let Inst{11-8} = offset{7-4};
let Inst{3-0} = offset{3-0};
}
def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
(ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
"\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
bits<5> Rm;
let Inst{23} = Rm{4};
let Inst{22} = 0;
let Inst{11-8} = 0;
let Inst{3-0} = Rm{3-0};
}
}
defm STRHT : AI3strT<0b1011, "strht">;
def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stl", "\t$Rt, $addr", []>;
def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlb", "\t$Rt, $addr", []>;
def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlh", "\t$Rt, $addr", []>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
InstrItinClass itin, InstrItinClass itin_upd> {
// IA is the default, so there is no need for an explicit suffix on the
// mnemonic here; the suffix-less form is the canonical spelling.
def IA :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeNone, f, itin,
!strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
let Inst{24-23} = 0b01; // Increment After
let Inst{22} = P_bit;
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
}
def IA_UPD :
AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeUpd, f, itin_upd,
!strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
let Inst{24-23} = 0b01; // Increment After
let Inst{22} = P_bit;
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def DA :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeNone, f, itin,
!strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
let Inst{24-23} = 0b00; // Decrement After
let Inst{22} = P_bit;
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
}
def DA_UPD :
AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeUpd, f, itin_upd,
!strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
let Inst{24-23} = 0b00; // Decrement After
let Inst{22} = P_bit;
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def DB :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeNone, f, itin,
!strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
let Inst{24-23} = 0b10; // Decrement Before
let Inst{22} = P_bit;
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
}
def DB_UPD :
AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeUpd, f, itin_upd,
!strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
let Inst{24-23} = 0b10; // Decrement Before
let Inst{22} = P_bit;
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def IB :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeNone, f, itin,
!strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
let Inst{24-23} = 0b11; // Increment Before
let Inst{22} = P_bit;
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
}
def IB_UPD :
AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
IndexModeUpd, f, itin_upd,
!strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
let Inst{24-23} = 0b11; // Increment Before
let Inst{22} = P_bit;
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
}
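// The {24-23} bits set above select the four addressing modes: 0b01 = IA
// (increment after), 0b00 = DA, 0b10 = DB and 0b11 = IB. For example,
// "stmdb sp!, {r4, lr}" (the usual push sequence) maps to STMDB_UPD and
// "ldmia r0!, {r4-r6}" maps to LDMIA_UPD.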
let hasSideEffects = 0 in {
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
IIC_iLoad_mu>, ComplexDeprecationPredicate<"ARMLoad">;
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
IIC_iStore_mu>,
ComplexDeprecationPredicate<"ARMStore">;
} // hasSideEffects
// FIXME: remove when we have a way to mark an MI with these properties.
// FIXME: Should pc be an implicit operand like PICADD, etc?
let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
reglist:$regs, variable_ops),
4, IIC_iLoad_mBr, [],
(LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
RegConstraint<"$Rn = $wb">;
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m,
IIC_iLoad_mu>;
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
IIC_iStore_mu>;
//===----------------------------------------------------------------------===//
// Move Instructions.
//
let hasSideEffects = 0 in
def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
"mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
let Inst{19-16} = 0b0000;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
}
// A version for the smaller set of tail call registers.
let hasSideEffects = 0 in
def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
}
def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
DPSoRegRegFrm, IIC_iMOVsr,
"mov", "\t$Rd, $src",
[(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> src;
let Inst{15-12} = Rd;
let Inst{19-16} = 0b0000;
let Inst{11-8} = src{11-8};
let Inst{7} = 0;
let Inst{6-5} = src{6-5};
let Inst{4} = 1;
let Inst{3-0} = src{3-0};
let Inst{25} = 0;
}
def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
DPSoRegImmFrm, IIC_iMOVsr,
"mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<12> src;
let Inst{15-12} = Rd;
let Inst{19-16} = 0b0000;
let Inst{11-5} = src{11-5};
let Inst{4} = 0;
let Inst{3-0} = src{3-0};
let Inst{25} = 0;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm, IIC_iMOVi,
"mov", "\t$Rd, $imm", [(set GPR:$Rd, mod_imm:$imm)]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> imm;
let Inst{25} = 1;
let Inst{15-12} = Rd;
let Inst{19-16} = 0b0000;
let Inst{11-0} = imm;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
DPFrm, IIC_iMOVi,
"movw", "\t$Rd, $imm",
[(set GPR:$Rd, imm0_65535:$imm)]>,
Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<16> imm;
let Inst{15-12} = Rd;
let Inst{11-0} = imm{11-0};
let Inst{19-16} = imm{15-12};
let Inst{20} = 0;
let Inst{25} = 1;
let DecoderMethod = "DecodeArmMOVTWInstruction";
}
def : InstAlias<"mov${p} $Rd, $imm",
(MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p), 0>,
Requires<[IsARM, HasV6T2]>;
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>;
let Constraints = "$src = $Rd" in {
def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
(ins GPR:$src, imm0_65535_expr:$imm),
DPFrm, IIC_iMOVi,
"movt", "\t$Rd, $imm",
[(set GPRnopc:$Rd,
(or (and GPR:$src, 0xffff),
lo16AllZero:$imm))]>, UnaryDP,
Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
bits<4> Rd;
bits<16> imm;
let Inst{15-12} = Rd;
let Inst{11-0} = imm{11-0};
let Inst{19-16} = imm{15-12};
let Inst{20} = 0;
let Inst{25} = 1;
let DecoderMethod = "DecodeArmMOVTWInstruction";
}
def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
Sched<[WriteALU]>;
} // Constraints
def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
Requires<[IsARM, HasV6T2]>;
let Uses = [CPSR] in
def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
[(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
Requires<[IsARM]>, Sched<[WriteALU]>;
// These aren't really mov instructions, but we have to define them this way
// due to flag operands.
let Defs = [CPSR] in {
def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
[(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
Sched<[WriteALU]>, Requires<[IsARM]>;
def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
[(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
Sched<[WriteALU]>, Requires<[IsARM]>;
}
//===----------------------------------------------------------------------===//
// Extend Instructions.
//
// Sign extenders
def SXTB : AI_ext_rrot<0b01101010,
"sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
def SXTH : AI_ext_rrot<0b01101011,
"sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
def SXTAB : AI_exta_rrot<0b01101010,
"sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
def SXTAH : AI_exta_rrot<0b01101011,
"sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)),
(SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot),
i16)),
(SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">;
def : ARMV6Pat<(int_arm_sxtb16 GPR:$Src),
(SXTB16 GPR:$Src, 0)>;
def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, GPR:$RHS),
(SXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
// Zero extenders
let AddedComplexity = 16 in {
def UXTB : AI_ext_rrot<0b01101110,
"uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>;
def UXTH : AI_ext_rrot<0b01101111,
"uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
def UXTB16 : AI_ext_rrot<0b01101100,
"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
// instead so we can include a check for masking back in the upper
// eight bits of the source into the lower eight bits of the result.
//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
// (UXTB16r_rot GPR:$Src, 3)>;
def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16 GPR:$Src, 1)>;
def : ARMV6Pat<(int_arm_uxtb16 GPR:$Src),
(UXTB16 GPR:$Src, 0)>;
def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)),
(UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
(UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
}
// This isn't safe in general; the add is two 16-bit adds, not one 32-bit add.
def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, GPR:$RHS),
(UXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
def SBFX : I<(outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
Requires<[IsARM, HasV6T2]> {
bits<4> Rd;
bits<4> Rn;
bits<5> lsb;
bits<5> width;
let Inst{27-21} = 0b0111101;
let Inst{6-4} = 0b101;
let Inst{20-16} = width;
let Inst{15-12} = Rd;
let Inst{11-7} = lsb;
let Inst{3-0} = Rn;
}
def UBFX : I<(outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
Requires<[IsARM, HasV6T2]> {
bits<4> Rd;
bits<4> Rn;
bits<5> lsb;
bits<5> width;
let Inst{27-21} = 0b0111111;
let Inst{6-4} = 0b101;
let Inst{20-16} = width;
let Inst{15-12} = Rd;
let Inst{11-7} = lsb;
let Inst{3-0} = Rn;
}
//===----------------------------------------------------------------------===//
// Arithmetic Instructions.
//
let isAdd = 1 in
defm ADD : AsI1_bin_irs<0b0100, "add",
IIC_iALUi, IIC_iALUr, IIC_iALUsr, add, 1>;
defm SUB : AsI1_bin_irs<0b0010, "sub",
IIC_iALUi, IIC_iALUr, IIC_iALUsr, sub>;
// ADD and SUB with 's' bit set.
//
// Currently, ADDS/SUBS are pseudo opcodes that exist only in the
// selection DAG. They are "lowered" to real ADD/SUB opcodes by
// AdjustInstrPostInstrSelection where we determine whether or not to
// set the "s" bit based on CPSR liveness.
//
// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
// support for an optional CPSR definition that corresponds to the DAG
// node's second value. We can then eliminate the implicit def of CPSR.
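// Concretely, an ARMaddc node selects to the ADDS pseudo; post-selection it
// should become a real ADD whose 's' bit is set only if the CPSR def is
// actually live, and left clear otherwise.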
let isAdd = 1 in
defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMaddc, 1>;
defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
let isAdd = 1 in
defm ADC : AI1_adde_sube_irs<0b0101, "adc", ARMadde, 1>;
defm SBC : AI1_adde_sube_irs<0b0110, "sbc", ARMsube>;
defm RSB : AsI1_rbin_irs<0b0011, "rsb",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
sub>;
// FIXME: Eliminate them if we can write def : Pat patterns which define
// CPSR so that the implicit def of CPSR is not needed.
defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr, ARMsubc>;
defm RSC : AI1_rsc_irs<0b0111, "rsc", ARMsube>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
// assume opposite meanings of the carry flag (i.e., carry == !borrow).
// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
// details.
def : ARMPat<(add GPR:$src, mod_imm_neg:$imm),
(SUBri GPR:$src, mod_imm_neg:$imm)>;
def : ARMPat<(ARMaddc GPR:$src, mod_imm_neg:$imm),
(SUBSri GPR:$src, mod_imm_neg:$imm)>;
def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
(SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
Requires<[IsARM, HasV6T2]>;
def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
(SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
Requires<[IsARM, HasV6T2]>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
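// In AddWithCarry terms: x - imm = x + ~imm + 1, so the no-carry-in subtract
// is an add of -imm, while SBC computes x + ~imm + C = x - imm - (1 - C),
// which is exactly subtract-with-borrow once the carry is read as !borrow.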
def : ARMPat<(ARMadde GPR:$src, mod_imm_not:$imm, CPSR),
(SBCri GPR:$src, mod_imm_not:$imm)>;
def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
(SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
Requires<[IsARM, HasV6T2]>;
// Note: These are implemented in C++ code, because they have to generate
// ADD/SUBrs instructions, which use a complex pattern that an xform function
// cannot produce.
// (mul X, 2^n+1) -> (add (X << n), X)
// (mul X, 2^n-1) -> (rsb X, (X << n))
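// For example, a multiply by 9 becomes (add (shl X, 3), X) and a multiply by
// 7 becomes (rsb X, (shl X, 3)), i.e. (X << 3) - X.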
// ARM Arithmetic Instruction
// GPR:$dst = GPR:$a op GPR:$b
class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
list<dag> pattern = [],
dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
string asm = "\t$Rd, $Rn, $Rm">
: AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rn;
bits<4> Rd;
bits<4> Rm;
let Inst{27-20} = op27_20;
let Inst{11-4} = op11_4;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
let Unpredictable{11-8} = 0b1111;
}
// Wrappers around the AAI class
class AAIRevOpr<bits<8> op27_20, bits<8> op11_4, string opc,
list<dag> pattern = []>
: AAI<op27_20, op11_4, opc,
pattern,
(ins GPRnopc:$Rm, GPRnopc:$Rn),
"\t$Rd, $Rm, $Rn">;
class AAIIntrinsic<bits<8> op27_20, bits<8> op11_4, string opc,
Intrinsic intrinsic>
: AAI<op27_20, op11_4, opc,
[(set GPRnopc:$Rd, (intrinsic GPRnopc:$Rn, GPRnopc:$Rm))]>;
// Saturating add/subtract
let hasSideEffects = 1 in {
def QADD8 : AAIIntrinsic<0b01100010, 0b11111001, "qadd8", int_arm_qadd8>;
def QADD16 : AAIIntrinsic<0b01100010, 0b11110001, "qadd16", int_arm_qadd16>;
def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>;
def QSUB8 : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>;
def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd",
[(set GPRnopc:$Rd, (int_arm_qadd (int_arm_qadd GPRnopc:$Rm,
GPRnopc:$Rm),
GPRnopc:$Rn))]>;
def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub",
[(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
(int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
def QSUB : AAIRevOpr<0b00010010, 0b00000101, "qsub",
[(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))]>;
let DecoderMethod = "DecodeQADDInstruction" in
def QADD : AAIRevOpr<0b00010000, 0b00000101, "qadd",
[(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>;
}
def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
def UQSUB8 : AAIIntrinsic<0b01100110, 0b11111111, "uqsub8", int_arm_uqsub8>;
def QASX : AAIIntrinsic<0b01100010, 0b11110011, "qasx", int_arm_qasx>;
def QSAX : AAIIntrinsic<0b01100010, 0b11110101, "qsax", int_arm_qsax>;
def UQASX : AAIIntrinsic<0b01100110, 0b11110011, "uqasx", int_arm_uqasx>;
def UQSAX : AAIIntrinsic<0b01100110, 0b11110101, "uqsax", int_arm_uqsax>;
// Signed/Unsigned add/subtract
def SASX : AAIIntrinsic<0b01100001, 0b11110011, "sasx", int_arm_sasx>;
def SADD16 : AAIIntrinsic<0b01100001, 0b11110001, "sadd16", int_arm_sadd16>;
def SADD8 : AAIIntrinsic<0b01100001, 0b11111001, "sadd8", int_arm_sadd8>;
def SSAX : AAIIntrinsic<0b01100001, 0b11110101, "ssax", int_arm_ssax>;
def SSUB16 : AAIIntrinsic<0b01100001, 0b11110111, "ssub16", int_arm_ssub16>;
def SSUB8 : AAIIntrinsic<0b01100001, 0b11111111, "ssub8", int_arm_ssub8>;
def UASX : AAIIntrinsic<0b01100101, 0b11110011, "uasx", int_arm_uasx>;
def UADD16 : AAIIntrinsic<0b01100101, 0b11110001, "uadd16", int_arm_uadd16>;
def UADD8 : AAIIntrinsic<0b01100101, 0b11111001, "uadd8", int_arm_uadd8>;
def USAX : AAIIntrinsic<0b01100101, 0b11110101, "usax", int_arm_usax>;
def USUB16 : AAIIntrinsic<0b01100101, 0b11110111, "usub16", int_arm_usub16>;
def USUB8 : AAIIntrinsic<0b01100101, 0b11111111, "usub8", int_arm_usub8>;
// Signed/Unsigned halving add/subtract
def SHASX : AAIIntrinsic<0b01100011, 0b11110011, "shasx", int_arm_shasx>;
def SHADD16 : AAIIntrinsic<0b01100011, 0b11110001, "shadd16", int_arm_shadd16>;
def SHADD8 : AAIIntrinsic<0b01100011, 0b11111001, "shadd8", int_arm_shadd8>;
def SHSAX : AAIIntrinsic<0b01100011, 0b11110101, "shsax", int_arm_shsax>;
def SHSUB16 : AAIIntrinsic<0b01100011, 0b11110111, "shsub16", int_arm_shsub16>;
def SHSUB8 : AAIIntrinsic<0b01100011, 0b11111111, "shsub8", int_arm_shsub8>;
def UHASX : AAIIntrinsic<0b01100111, 0b11110011, "uhasx", int_arm_uhasx>;
def UHADD16 : AAIIntrinsic<0b01100111, 0b11110001, "uhadd16", int_arm_uhadd16>;
def UHADD8 : AAIIntrinsic<0b01100111, 0b11111001, "uhadd8", int_arm_uhadd8>;
def UHSAX : AAIIntrinsic<0b01100111, 0b11110101, "uhsax", int_arm_uhsax>;
def UHSUB16 : AAIIntrinsic<0b01100111, 0b11110111, "uhsub16", int_arm_uhsub16>;
def UHSUB8 : AAIIntrinsic<0b01100111, 0b11111111, "uhsub8", int_arm_uhsub8>;
// Unsigned Sum of Absolute Differences [and Accumulate].
def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
MulFrm /* for convenience */, NoItinerary, "usad8",
"\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (int_arm_usad8 GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{27-20} = 0b01111000;
let Inst{15-12} = 0b1111;
let Inst{7-4} = 0b0001;
let Inst{19-16} = Rd;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
MulFrm /* for convenience */, NoItinerary, "usada8",
"\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (int_arm_usada8 GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
bits<4> Ra;
let Inst{27-20} = 0b01111000;
let Inst{7-4} = 0b0001;
let Inst{19-16} = Rd;
let Inst{15-12} = Ra;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
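// E.g. usad8 computes Rd = |Rn[7:0]-Rm[7:0]| + |Rn[15:8]-Rm[15:8]| +
// |Rn[23:16]-Rm[23:16]| + |Rn[31:24]-Rm[31:24]|, and usada8 adds Ra to that
// sum.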
// Signed/Unsigned saturate
def SSAT : AI<(outs GPRnopc:$Rd),
(ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
bits<8> sh;
let Inst{27-21} = 0b0110101;
let Inst{5-4} = 0b01;
let Inst{20-16} = sat_imm;
let Inst{15-12} = Rd;
let Inst{11-7} = sh{4-0};
let Inst{6} = sh{5};
let Inst{3-0} = Rn;
}
def SSAT16 : AI<(outs GPRnopc:$Rd),
(ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
let Inst{27-20} = 0b01101010;
let Inst{11-4} = 0b11110011;
let Inst{15-12} = Rd;
let Inst{19-16} = sat_imm;
let Inst{3-0} = Rn;
}
def USAT : AI<(outs GPRnopc:$Rd),
(ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
Requires<[IsARM,HasV6]> {
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
bits<8> sh;
let Inst{27-21} = 0b0110111;
let Inst{5-4} = 0b01;
let Inst{15-12} = Rd;
let Inst{11-7} = sh{4-0};
let Inst{6} = sh{5};
let Inst{20-16} = sat_imm;
let Inst{3-0} = Rn;
}
def USAT16 : AI<(outs GPRnopc:$Rd),
(ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsARM,HasV6]>{
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
let Inst{27-20} = 0b01101110;
let Inst{11-4} = 0b11110011;
let Inst{15-12} = Rd;
let Inst{19-16} = sat_imm;
let Inst{3-0} = Rn;
}
def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos),
(SSAT imm1_32:$pos, GPRnopc:$a, 0)>;
def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
(SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),
(SSAT16 imm1_16:$pos, GPRnopc:$a)>;
def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos),
(USAT16 imm0_15:$pos, GPRnopc:$a)>;
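// For example, "ssat r0, #8, r1" clamps r1 to the signed 8-bit range
// [-128, 127] and "usat r0, #8, r1" clamps it to [0, 255]; the optional $sh
// operand applies an LSL or ASR to Rn before saturating.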
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
//
defm AND : AsI1_bin_irs<0b0000, "and",
IIC_iBITi, IIC_iBITr, IIC_iBITsr, and, 1>;
defm ORR : AsI1_bin_irs<0b1100, "orr",
IIC_iBITi, IIC_iBITr, IIC_iBITsr, or, 1>;
defm EOR : AsI1_bin_irs<0b0001, "eor",
IIC_iBITi, IIC_iBITr, IIC_iBITsr, xor, 1>;
defm BIC : AsI1_bin_irs<0b1110, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
// like in the actual instruction encoding. The complexity of mapping the mask
// to the lsb/msb pair should be handled by ISel, not encapsulated in the
// instruction description.
def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"bfc", "\t$Rd, $imm", "$src = $Rd",
[(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
Requires<[IsARM, HasV6T2]> {
bits<4> Rd;
bits<10> imm;
let Inst{27-21} = 0b0111110;
let Inst{6-0} = 0b0011111;
let Inst{15-12} = Rd;
let Inst{11-7} = imm{4-0}; // lsb
let Inst{20-16} = imm{9-5}; // msb
}
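// For example, "bfc r0, #8, #8" clears bits 15:8 (r0 &= 0xffff00ff); in the
// A1 encoding that is lsb = 8, msb = 15, i.e. msb = lsb + width - 1.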
// A8.6.18 BFI - Bitfield insert (Encoding A1)
def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
[(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
bf_inv_mask_imm:$imm))]>,
Requires<[IsARM, HasV6T2]> {
bits<4> Rd;
bits<4> Rn;
bits<10> imm;
let Inst{27-21} = 0b0111110;
let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15
let Inst{15-12} = Rd;
let Inst{11-7} = imm{4-0}; // lsb
let Inst{20-16} = imm{9-5}; // width
let Inst{3-0} = Rn;
}
def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
"mvn", "\t$Rd, $Rm",
[(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<4> Rm;
let Inst{25} = 0;
let Inst{19-16} = 0b0000;
let Inst{11-4} = 0b00000000;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
}
def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
[(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = 0b0000;
let Inst{15-12} = Rd;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
}
def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
[(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
Sched<[WriteALU]> {
bits<4> Rd;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = 0b0000;
let Inst{15-12} = Rd;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
IIC_iMVNi, "mvn", "\t$Rd, $imm",
[(set GPR:$Rd, mod_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
bits<4> Rd;
bits<12> imm;
let Inst{25} = 1;
let Inst{19-16} = 0b0000;
let Inst{15-12} = Rd;
let Inst{11-0} = imm;
}
let AddedComplexity = 1 in
def : ARMPat<(and GPR:$src, mod_imm_not:$imm),
(BICri GPR:$src, mod_imm_not:$imm)>;
//===----------------------------------------------------------------------===//
// Multiply Instructions.
//
class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
let Inst{19-16} = Rd;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
bits<4> Rn;
let Inst{19-16} = RdHi;
let Inst{15-12} = RdLo;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
bits<4> Rn;
let Inst{19-16} = RdHi;
let Inst{15-12} = RdLo;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
// FIXME: The v5 pseudos are only necessary for the additional Constraint
// property. Remove them when it's possible to add those properties
// on an individual MachineInstr, not just an instruction description.
let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm),
IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b0000;
let Unpredictable{15-12} = 0b1111;
}
let Constraints = "@earlyclobber $Rd" in
def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
pred:$p, cc_out:$s),
4, IIC_iMUL32,
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6, UseMulOps]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
Requires<[IsARM, HasV6, UseMulOps]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
let Constraints = "@earlyclobber $Rd" in
def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
pred:$p, cc_out:$s), 4, IIC_iMAC32,
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
Requires<[IsARM, HasV6T2, UseMulOps]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
bits<4> Ra;
let Inst{19-16} = Rd;
let Inst{15-12} = Ra;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
// Extra precision multiplies with low / high results
let hasSideEffects = 0 in {
let isCommutable = 1 in {
def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
"smull", "\t$RdLo, $RdHi, $Rn, $Rm",
[(set GPR:$RdLo, GPR:$RdHi,
(smullohi GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
"umull", "\t$RdLo, $RdHi, $Rn, $Rm",
[(set GPR:$RdLo, GPR:$RdHi,
(umullohi GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL]>;
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
4, IIC_iMUL64,
[(set GPR:$RdLo, GPR:$RdHi,
(smullohi GPR:$Rn, GPR:$Rm))],
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>,
Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
4, IIC_iMUL64,
[(set GPR:$RdLo, GPR:$RdHi,
(umullohi GPR:$Rn, GPR:$Rm))],
(UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>,
Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
}
// Multiply + accumulate
def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
bits<4> Rn;
let Inst{19-16} = RdHi;
let Inst{15-12} = RdLo;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
let Constraints =
"@earlyclobber $RdLo,@earlyclobber $RdHi,$RLo = $RdLo,$RHi = $RdHi" in {
def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
(SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
(UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
}
} // hasSideEffects
// Most significant word multiply
def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
Requires<[IsARM, HasV6, UseMulOps]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6, UseMulOps]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sext_inreg GPR:$Rm, i16)))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sra GPR:$Rm, (i32 16))))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
}
multiclass AI_smla<string opc> {
let DecoderMethod = "DecodeSMLAInstruction" in {
def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add GPR:$Ra,
(mul (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
(add GPR:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd,
(add GPR:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
}
}
defm SMUL : AI_smul<"smul">;
defm SMLA : AI_smla<"smla">;
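// Illustrative semantics of the halfword multiplies defined above (assembly
// examples only):
//   smulbb r0, r1, r2      ; r0 = sext(r1[15:0]) * sext(r2[15:0])
//   smlatb r0, r1, r2, r3  ; r0 = r3 + sext(r1[31:16]) * sext(r2[15:0])
// The "b"/"t" suffix letters select the bottom/top halfword of each operand.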
// Halfword multiply accumulate long: SMLAL<x><y>.
class SMLAL<bits<2> opc1, string asm>
: AMulxyI64<0b0001010, opc1,
(outs GPRnopc:$RdLo, GPRnopc:$RdHi),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def SMLALBB : SMLAL<0b00, "smlalbb">;
def SMLALBT : SMLAL<0b10, "smlalbt">;
def SMLALTB : SMLAL<0b01, "smlaltb">;
def SMLALTT : SMLAL<0b11, "smlaltt">;
def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
(SMLALBB $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
(SMLALBT $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
(SMLALTB $Rn, $Rm, $RLo, $RHi)>;
def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
(SMLALTT $Rn, $Rm, $RLo, $RHi)>;
// Helper class for AI_smld.
class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AI<oops, iops, MulFrm, itin, opc, asm, []>,
Requires<[IsARM, HasV6]> {
bits<4> Rn;
bits<4> Rm;
let Inst{27-23} = 0b01110;
let Inst{22} = long;
let Inst{21-20} = 0b00;
let Inst{11-8} = Rm;
let Inst{7} = 0;
let Inst{6} = sub;
let Inst{5} = swap;
let Inst{4} = 1;
let Inst{3-0} = Rn;
}
class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
bits<4> Rd;
let Inst{15-12} = 0b1111;
let Inst{19-16} = Rd;
}
class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
bits<4> Ra;
bits<4> Rd;
let Inst{19-16} = Rd;
let Inst{15-12} = Ra;
}
class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
bits<4> RdLo;
bits<4> RdHi;
let Inst{19-16} = RdHi;
let Inst{15-12} = RdLo;
}
multiclass AI_smld<bit sub, string opc> {
def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">,
Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
NoItinerary,
!strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
NoItinerary,
!strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">,
RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
defm SMLA : AI_smld<0, "smla">;
defm SMLS : AI_smld<1, "smls">;
def : ARMV6Pat<(int_arm_smlad GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
(SMLAD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
def : ARMV6Pat<(int_arm_smladx GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
(SMLADX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
def : ARMV6Pat<(int_arm_smlsd GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
(SMLSD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
def : ARMV6Pat<(int_arm_smlsdx GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
(SMLSDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
def : ARMV6Pat<(ARMSmlald GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
(SMLALD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
def : ARMV6Pat<(ARMSmlaldx GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
(SMLALDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
def : ARMV6Pat<(ARMSmlsld GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
(SMLSLD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
def : ARMV6Pat<(ARMSmlsldx GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
(SMLSLDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
multiclass AI_sdml<bit sub, string opc> {
def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
defm SMUA : AI_sdml<0, "smua">;
defm SMUS : AI_sdml<1, "smus">;
def : ARMV6Pat<(int_arm_smuad GPRnopc:$Rn, GPRnopc:$Rm),
(SMUAD GPRnopc:$Rn, GPRnopc:$Rm)>;
def : ARMV6Pat<(int_arm_smuadx GPRnopc:$Rn, GPRnopc:$Rm),
(SMUADX GPRnopc:$Rn, GPRnopc:$Rm)>;
def : ARMV6Pat<(int_arm_smusd GPRnopc:$Rn, GPRnopc:$Rm),
(SMUSD GPRnopc:$Rn, GPRnopc:$Rm)>;
def : ARMV6Pat<(int_arm_smusdx GPRnopc:$Rn, GPRnopc:$Rm),
(SMUSDX GPRnopc:$Rn, GPRnopc:$Rm)>;
//===----------------------------------------------------------------------===//
// Division Instructions (ARMv7-A with virtualization extension)
//
def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasDivideInARM]>,
Sched<[WriteDIV]>;
def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasDivideInARM]>,
Sched<[WriteDIV]>;
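// Illustrative use: "sdiv r0, r1, r2" computes r0 = r1 / r2 (signed, rounded
// toward zero); "udiv" is the unsigned counterpart.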
//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
//
def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "clz", "\t$Rd, $Rm",
[(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
Sched<[WriteALU]>;
def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "rbit", "\t$Rd, $Rm",
[(set GPR:$Rd, (bitreverse GPR:$Rm))]>,
Requires<[IsARM, HasV6T2]>,
Sched<[WriteALU]>;
def REV : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "rev", "\t$Rd, $Rm",
[(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
Sched<[WriteALU]>;
let AddedComplexity = 5 in
def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "rev16", "\t$Rd, $Rm",
[(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteALU]>;
def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
(REV16 (LDRH addrmode3:$addr))>;
def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
(STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
let AddedComplexity = 5 in
def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
IIC_iUNAr, "revsh", "\t$Rd, $Rm",
[(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteALU]>;
def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
(and (srl GPR:$Rm, (i32 8)), 0xFF)),
(REVSH GPR:$Rm)>;
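// Illustrative values for the byte-reversal instructions above: with
// r1 = 0xAABBCCDD, "rev r0, r1" gives 0xDDCCBBAA, "rev16 r0, r1" gives
// 0xBBAADDCC, and "revsh r0, r1" gives 0xFFFFDDCC (bottom halfword
// byte-reversed and sign-extended).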
def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
[(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
(and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
0xFFFF0000)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteALUsi, ReadALU]>;
// Alternate cases for PKHBT where identities eliminate some nodes.
def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
(PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
(PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
// will match the pattern below.
def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
[(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
(and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
0xFFFF)))]>,
Requires<[IsARM, HasV6]>,
Sched<[WriteALUsi, ReadALU]>;
// Alternate cases for PKHTB where identities eliminate some nodes. Note that
// a shift amount of 0 is *not legal* here; that case is PKHBT instead.
// We also cannot replace an srl (17..31) by the arithmetic shift we would use
// in pkhtb src1, src2, asr (17..31).
def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
(srl GPRnopc:$src2, imm16:$sh)),
(PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
(sra GPRnopc:$src2, imm16_31:$sh)),
(PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
(and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
(PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
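// Illustrative packing semantics: "pkhbt r0, r1, r2, lsl #16" yields
// r0 = (r2[15:0] << 16) | r1[15:0], while "pkhtb r0, r1, r2, asr #16"
// yields r0 = (r1 & 0xFFFF0000) | r2[31:16], matching the patterns above.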
//===----------------------------------------------------------------------===//
// CRC Instructions
//
// Polynomials:
// + CRC32{B,H,W} 0x04C11DB7
// + CRC32C{B,H,W} 0x1EDC6F41
//
class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
: AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
!strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
[(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
Requires<[IsARM, HasV8, HasCRC]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{31-28} = 0b1110;
let Inst{27-23} = 0b00010;
let Inst{22-21} = sz;
let Inst{20} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-10} = 0b00;
let Inst{9} = C;
let Inst{8} = 0;
let Inst{7-4} = 0b0100;
let Inst{3-0} = Rm;
let Unpredictable{11-8} = 0b1101;
}
def CRC32B : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
def CRC32H : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
def CRC32W : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
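// Illustrative use: "crc32w r0, r0, r1" folds one word of data from r1 into
// the running CRC accumulated in r0 using the 0x04C11DB7 polynomial; the "c"
// variants use the CRC-32C polynomial listed above.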
//===----------------------------------------------------------------------===//
// ARMv8.1a Privilege Access Never extension
//
// SETPAN #imm1
def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan",
"\t$imm", []>, Requires<[IsARM, HasV8, HasV8_1a]> {
bits<1> imm;
let Inst{31-28} = 0b1111;
let Inst{27-20} = 0b00010001;
let Inst{19-16} = 0b0000;
let Inst{15-10} = 0b000000;
let Inst{9} = imm;
let Inst{8} = 0b0;
let Inst{7-4} = 0b0000;
let Inst{3-0} = 0b0000;
let Unpredictable{19-16} = 0b1111;
let Unpredictable{15-10} = 0b111111;
let Unpredictable{8} = 0b1;
let Unpredictable{3-0} = 0b1111;
}
//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
defm CMP : AI1_cmp_irs<0b1010, "cmp",
IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr, ARMcmp>;
// ARMcmpZ can re-use the above instruction definitions.
def : ARMPat<(ARMcmpZ GPR:$src, mod_imm:$imm),
(CMPri GPR:$src, mod_imm:$imm)>;
def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
(CMPrr GPR:$src, GPR:$rhs)>;
def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
(CMPrsi GPR:$src, so_reg_imm:$rhs)>;
def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
(CMPrsr GPR:$src, so_reg_reg:$rhs)>;
// CMN register-integer
let isCompare = 1, Defs = [CPSR] in {
def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, mod_imm:$imm), DPFrm, IIC_iCMPi,
"cmn", "\t$Rn, $imm",
[(ARMcmn GPR:$Rn, mod_imm:$imm)]>,
Sched<[WriteCMP, ReadALU]> {
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-0} = imm;
let Unpredictable{15-12} = 0b1111;
}
// CMN register-register/shift
def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
"cmn", "\t$Rn, $Rm",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
bits<4> Rn;
bits<4> Rm;
let isCommutable = 1;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rm;
let Unpredictable{15-12} = 0b1111;
}
def CMNzrsi : AI1<0b1011, (outs),
(ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
"cmn", "\t$Rn, $shift",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
GPR:$Rn, so_reg_imm:$shift)]>,
Sched<[WriteCMPsi, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-5} = shift{11-5};
let Inst{4} = 0;
let Inst{3-0} = shift{3-0};
let Unpredictable{15-12} = 0b1111;
}
def CMNzrsr : AI1<0b1011, (outs),
(ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
"cmn", "\t$Rn, $shift",
[(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
GPRnopc:$Rn, so_reg_reg:$shift)]>,
Sched<[WriteCMPsr, ReadALU]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
let Inst{11-8} = shift{11-8};
let Inst{7} = 0;
let Inst{6-5} = shift{6-5};
let Inst{4} = 1;
let Inst{3-0} = shift{3-0};
let Unpredictable{15-12} = 0b1111;
}
}
def : ARMPat<(ARMcmp GPR:$src, mod_imm_neg:$imm),
(CMNri GPR:$src, mod_imm_neg:$imm)>;
def : ARMPat<(ARMcmpZ GPR:$src, mod_imm_neg:$imm),
(CMNri GPR:$src, mod_imm_neg:$imm)>;
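// For example, "cmp r0, #-1" has no mod_imm encoding, so it is selected as
// "cmn r0, #1", which sets the flags for r0 + 1 (i.e. r0 - (-1)).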
// Note that TST/TEQ don't set all the same flags that CMP does!
defm TST : AI1_cmp_irs<0b1000, "tst",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1,
"DecodeTSTInstruction">;
defm TEQ : AI1_cmp_irs<0b1001, "teq",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
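// Illustrative flag behaviour: "tst r0, #1" sets N and Z from (r0 & 1) and C
// only from the shifter, leaving V unchanged, whereas CMP also updates C and V
// from the subtraction.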
// Pseudo i64 compares for some floating point compares.
let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
Defs = [CPSR] in {
def BCCi64 : PseudoInst<(outs),
(ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
IIC_Br,
[(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
Sched<[WriteBr]>;
def BCCZi64 : PseudoInst<(outs),
(ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
[(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
Sched<[WriteBr]>;
} // usesCustomInserter
// Conditional moves
let hasSideEffects = 0 in {
let isCommutable = 1, isSelect = 1 in
def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, GPR:$Rm, cmovpred:$p),
4, IIC_iCMOVr,
[(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
4, IIC_iCMOVsr,
[(set GPR:$Rd,
(ARMcmov GPR:$false, so_reg_imm:$shift,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
4, IIC_iCMOVsr,
[(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
let isMoveImm = 1 in
def MOVCCi16
: ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
4, IIC_iMOVi,
[(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
Sched<[WriteALU]>;
let isMoveImm = 1 in
def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, mod_imm:$imm, cmovpred:$p),
4, IIC_iCMOVi,
[(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm:$imm,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
// Two instruction predicate mov immediate.
let isMoveImm = 1 in
def MOVCCi32imm
: ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, i32imm:$src, cmovpred:$p),
8, IIC_iCMOVix2,
[(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
let isMoveImm = 1 in
def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
(ins GPR:$false, mod_imm:$imm, cmovpred:$p),
4, IIC_iCMOVi,
[(set GPR:$Rd, (ARMcmov GPR:$false, mod_imm_not:$imm,
cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
} // hasSideEffects
//===----------------------------------------------------------------------===//
// Atomic operations intrinsics
//
def MemBarrierOptOperand : AsmOperandClass {
let Name = "MemBarrierOpt";
let ParserMethod = "parseMemBarrierOptOperand";
}
def memb_opt : Operand<i32> {
let PrintMethod = "printMemBOption";
let ParserMatchClass = MemBarrierOptOperand;
let DecoderMethod = "DecodeMemBarrierOption";
}
def InstSyncBarrierOptOperand : AsmOperandClass {
let Name = "InstSyncBarrierOpt";
let ParserMethod = "parseInstSyncBarrierOptOperand";
}
def instsyncb_opt : Operand<i32> {
let PrintMethod = "printInstSyncBOption";
let ParserMatchClass = InstSyncBarrierOptOperand;
let DecoderMethod = "DecodeInstSyncBarrierOption";
}
// Memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
"dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff05;
let Inst{3-0} = opt;
}
def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
"dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff04;
let Inst{3-0} = opt;
}
// ISB has only the full-system option
def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
"isb", "\t$opt", [(int_arm_isb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff06;
let Inst{3-0} = opt;
}
}
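// Illustrative barrier options: opt = 0b1111 is "sy" (full system, the
// default used by the assembler aliases near the end of this file) and
// 0b1011 is "ish" (inner shareable), e.g. "dmb ish".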
let usesCustomInserter = 1, Defs = [CPSR] in {
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
}
let usesCustomInserter = 1 in {
def COPY_STRUCT_BYVAL_I32 : PseudoInst<
(outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
NoItinerary,
[(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
}
let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in {
// %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs...
// Copies N registers worth of memory from address %src to address %dst
// and returns the incremented addresses. N scratch registers will
// be attached for the copy to use.
def MEMCPY : PseudoInst<
(outs GPR:$newdst, GPR:$newsrc),
(ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops),
NoItinerary,
[(set GPR:$newdst, GPR:$newsrc,
(ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>;
}
def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
def strex_1 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_strex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
def strex_2 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_strex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
def strex_4 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_strex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
def ldaex_1 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
def ldaex_2 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
def ldaex_4 : PatFrag<(ops node:$ptr), (int_arm_ldaex node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
def stlex_1 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_stlex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
def stlex_2 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_stlex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
def stlex_4 : PatFrag<(ops node:$val, node:$ptr),
(int_arm_stlex node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
let mayLoad = 1 in {
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrexb", "\t$Rt, $addr",
[(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrexh", "\t$Rt, $addr",
[(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrex", "\t$Rt, $addr",
[(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
let hasExtraDefRegAllocReq = 1 in
def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegLoad";
}
def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldaexb", "\t$Rt, $addr",
[(set GPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>;
def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldaexh", "\t$Rt, $addr",
[(set GPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>;
def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldaex", "\t$Rt, $addr",
[(set GPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>;
let hasExtraDefRegAllocReq = 1 in
def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegLoad";
}
}
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd, (strex_1 GPR:$Rt,
addr_offset_none:$addr))]>;
def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd, (strex_2 GPR:$Rt,
addr_offset_none:$addr))]>;
def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strex", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd, (strex_4 GPR:$Rt,
addr_offset_none:$addr))]>;
let hasExtraSrcRegAllocReq = 1 in
def STREXD : AIstrex<0b01, (outs GPR:$Rd),
(ins GPRPairOp:$Rt, addr_offset_none:$addr),
NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegStore";
}
def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd,
(stlex_1 GPR:$Rt, addr_offset_none:$addr))]>;
def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd,
(stlex_2 GPR:$Rt, addr_offset_none:$addr))]>;
def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
[(set GPR:$Rd,
(stlex_4 GPR:$Rt, addr_offset_none:$addr))]>;
let hasExtraSrcRegAllocReq = 1 in
def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
(ins GPRPairOp:$Rt, addr_offset_none:$addr),
NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegStore";
}
}
def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
[(int_arm_clrex)]>,
Requires<[IsARM, HasV6K]> {
let Inst{31-0} = 0b11110101011111111111000000011111;
}
def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
(STREXB GPR:$Rt, addr_offset_none:$addr)>;
def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
(STREXH GPR:$Rt, addr_offset_none:$addr)>;
def : ARMPat<(stlex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
(STLEXB GPR:$Rt, addr_offset_none:$addr)>;
def : ARMPat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
(STLEXH GPR:$Rt, addr_offset_none:$addr)>;
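// Illustrative use of the exclusive pairs above (an atomic increment loop):
//   retry: ldrex r1, [r0]
//          add   r1, r1, #1
//          strex r2, r1, [r0]   ; r2 = 0 on success, 1 if exclusivity was lost
//          cmp   r2, #0
//          bne   retry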
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
return isAcquireOrStronger(Ordering);
}]>;
def atomic_load_acquire_8 : acquiring_load<atomic_load_8>;
def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
return isReleaseOrStronger(Ordering);
}]>;
def atomic_store_release_8 : releasing_store<atomic_store_8>;
def atomic_store_release_16 : releasing_store<atomic_store_16>;
def atomic_store_release_32 : releasing_store<atomic_store_32>;
let AddedComplexity = 8 in {
def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr), (LDAB addr_offset_none:$addr)>;
def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA addr_offset_none:$addr)>;
def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (STLB GPR:$val, addr_offset_none:$addr)>;
def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>;
}
// SWP/SWPB are deprecated in V6/V7 and optional in v7VE.
// FIXME Use InstAlias to generate LDREX/STREX pairs instead.
let mayLoad = 1, mayStore = 1 in {
def SWP : AIswp<0, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
Requires<[IsARM,PreV8]>;
def SWPB: AIswp<1, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
Requires<[IsARM,PreV8]>;
}
//===----------------------------------------------------------------------===//
// Coprocessor Instructions.
//
def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
Requires<[IsARM,PreV8]> {
bits<4> opc1;
bits<4> CRn;
bits<4> CRd;
bits<4> cop;
bits<3> opc2;
bits<4> CRm;
let Inst{3-0} = CRm;
let Inst{4} = 0;
let Inst{7-5} = opc2;
let Inst{11-8} = cop;
let Inst{15-12} = CRd;
let Inst{19-16} = CRn;
let Inst{23-20} = opc1;
}
def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
bits<4> CRn;
bits<4> CRd;
bits<4> cop;
bits<3> opc2;
bits<4> CRm;
let Inst{3-0} = CRm;
let Inst{4} = 0;
let Inst{7-5} = opc2;
let Inst{11-8} = cop;
let Inst{15-12} = CRd;
let Inst{19-16} = CRn;
let Inst{23-20} = opc1;
}
class ACI<dag oops, dag iops, string opc, string asm,
list<dag> pattern, IndexMode im = IndexModeNone>
: I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
opc, asm, "", pattern> {
let Inst{27-25} = 0b110;
}
class ACInoP<dag oops, dag iops, string opc, string asm,
list<dag> pattern, IndexMode im = IndexModeNone>
: InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
opc, asm, "", pattern> {
let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b110;
}
multiclass LdStCop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 1; // P = 1
let Inst{23} = addr{8};
let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
let Inst{20} = load;
let Inst{19-16} = addr{12-9};
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = addr{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 1; // P = 1
let Inst{23} = addr{8};
let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
let Inst{20} = load;
let Inst{19-16} = addr{12-9};
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = addr{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = offset{8};
let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
let Inst{20} = load;
let Inst{19-16} = addr;
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = offset{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _OPTION : ACI<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = 1; // U = 1
let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
let Inst{20} = load;
let Inst{19-16} = addr;
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = option;
let DecoderMethod = "DecodeCopMemInstruction";
}
}
multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
asm, "\t$cop, $CRd, $addr", pattern> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 1; // P = 1
let Inst{23} = addr{8};
let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
let Inst{20} = load;
let Inst{19-16} = addr{12-9};
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = addr{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
asm, "\t$cop, $CRd, $addr!", [], IndexModePre> {
bits<13> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 1; // P = 1
let Inst{23} = addr{8};
let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
let Inst{20} = load;
let Inst{19-16} = addr{12-9};
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = addr{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
postidx_imm8s4:$offset),
asm, "\t$cop, $CRd, $addr, $offset", [], IndexModePost> {
bits<9> offset;
bits<4> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = offset{8};
let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
let Inst{20} = load;
let Inst{19-16} = addr;
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = offset{7-0};
let DecoderMethod = "DecodeCopMemInstruction";
}
def _OPTION : ACInoP<(outs),
(ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
coproc_option_imm:$option),
asm, "\t$cop, $CRd, $addr, $option", []> {
bits<8> option;
bits<4> addr;
bits<4> cop;
bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = 1; // U = 1
let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
let Inst{20} = load;
let Inst{19-16} = addr;
let Inst{15-12} = CRd;
let Inst{11-8} = cop;
let Inst{7-0} = option;
let DecoderMethod = "DecodeCopMemInstruction";
}
}
defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
//
class MovRCopro<string opc, bit direction, dag oops, dag iops,
list<dag> pattern>
: ABI<0b1110, oops, iops, NoItinerary, opc,
"\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
let Inst{20} = direction;
let Inst{4} = 1;
bits<4> Rt;
bits<4> cop;
bits<3> opc1;
bits<3> opc2;
bits<4> CRm;
bits<4> CRn;
let Inst{15-12} = Rt;
let Inst{11-8} = cop;
let Inst{23-21} = opc1;
let Inst{7-5} = opc2;
let Inst{3-0} = CRm;
let Inst{19-16} = CRn;
}
def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
ComplexDeprecationPredicate<"MCR">;
def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>;
def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
(MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
class MovRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern>
: ABXI<0b1110, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
let Inst{31-24} = 0b11111110;
let Inst{20} = direction;
let Inst{4} = 1;
bits<4> Rt;
bits<4> cop;
bits<3> opc1;
bits<3> opc2;
bits<4> CRm;
bits<4> CRn;
let Inst{15-12} = Rt;
let Inst{11-8} = cop;
let Inst{23-21} = opc1;
let Inst{7-5} = opc2;
let Inst{3-0} = CRm;
let Inst{19-16} = CRn;
}
def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
(outs),
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0)>;
def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>,
Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
imm:$CRm, imm:$opc2),
(MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
pattern = []>
: ABI<0b1100, oops, iops, NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
pattern> {
let Inst{23-21} = 0b010;
let Inst{20} = direction;
bits<4> Rt;
bits<4> Rt2;
bits<4> cop;
bits<4> opc1;
bits<4> CRm;
let Inst{15-12} = Rt;
let Inst{19-16} = Rt2;
let Inst{11-8} = cop;
let Inst{7-4} = opc1;
let Inst{3-0} = CRm;
}
def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
[(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, imm:$CRm)]>;
def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern = []>
: ABXI<0b1100, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
let Inst{23-21} = 0b010;
let Inst{20} = direction;
bits<4> Rt;
bits<4> Rt2;
bits<4> cop;
bits<4> opc1;
bits<4> CRm;
let Inst{15-12} = Rt;
let Inst{19-16} = Rt2;
let Inst{11-8} = cop;
let Inst{7-4} = opc1;
let Inst{3-0} = CRm;
let DecoderMethod = "DecoderForMRRC2AndMCRR2";
}
def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
(outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, c_imm:$CRm),
[(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
GPRnopc:$Rt2, imm:$CRm)]>;
def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRnopc:$Rt, GPRnopc:$Rt2),
(ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
//===----------------------------------------------------------------------===//
// Move between special register and ARM core register
//
// Move to ARM core register from Special Register
def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
"mrs", "\t$Rd, apsr", []> {
bits<4> Rd;
let Inst{23-16} = 0b00001111;
let Unpredictable{19-17} = 0b111;
let Inst{15-12} = Rd;
let Inst{11-0} = 0b000000000000;
let Unpredictable{11-0} = 0b110100001111;
}
def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p), 0>,
Requires<[IsARM]>;
// The MRSsys instruction is the MRS instruction from the ARM ARM,
// section B9.3.9, with the R bit set to 1.
def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
"mrs", "\t$Rd, spsr", []> {
bits<4> Rd;
let Inst{23-16} = 0b01001111;
let Unpredictable{19-16} = 0b1111;
let Inst{15-12} = Rd;
let Inst{11-0} = 0b000000000000;
let Unpredictable{11-0} = 0b110100001111;
}
// However, the MRS (banked register) system instruction (ARMv7VE) *does* have a
// separate encoding (distinguished by bit 5).
def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
NoItinerary, "mrs", "\t$Rd, $banked", []>,
Requires<[IsARM, HasVirtualization]> {
bits<6> banked;
bits<4> Rd;
let Inst{23} = 0;
let Inst{22} = banked{5}; // R bit
let Inst{21-20} = 0b00;
let Inst{19-16} = banked{3-0};
let Inst{15-12} = Rd;
let Inst{11-9} = 0b001;
let Inst{8} = banked{4};
let Inst{7-0} = 0b00000000;
}
// Move from ARM core register to Special Register
//
// No need to have both system and application versions of MSR (immediate) or
// MSR (register); the encodings are the same, and the assembly parser has no way
// to distinguish between them. The mask operand contains the special register
// (R bit) in bit 4, and bits 3-0 contain the mask with the fields to be
// accessed in the special register.
let Defs = [CPSR] in
def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
"msr", "\t$mask, $Rn", []> {
bits<5> mask;
bits<4> Rn;
let Inst{23} = 0;
let Inst{22} = mask{4}; // R bit
let Inst{21-20} = 0b10;
let Inst{19-16} = mask{3-0};
let Inst{15-12} = 0b1111;
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rn;
}
let Defs = [CPSR] in
def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, mod_imm:$imm), NoItinerary,
"msr", "\t$mask, $imm", []> {
bits<5> mask;
bits<12> imm;
let Inst{23} = 0;
let Inst{22} = mask{4}; // R bit
let Inst{21-20} = 0b10;
let Inst{19-16} = mask{3-0};
let Inst{15-12} = 0b1111;
let Inst{11-0} = imm;
}
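// Illustrative mask encoding for the MSR forms above: "msr CPSR_fc, r0" uses
// mask{4} = 0 (CPSR rather than SPSR) and mask{3-0} = 0b1001, selecting the
// flags (f) and control (c) fields.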
// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
// separate encoding (distinguished by bit 5).
def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
NoItinerary, "msr", "\t$banked, $Rn", []>,
Requires<[IsARM, HasVirtualization]> {
bits<6> banked;
bits<4> Rn;
let Inst{23} = 0;
let Inst{22} = banked{5}; // R bit
let Inst{21-20} = 0b10;
let Inst{19-16} = banked{3-0};
let Inst{15-12} = 0b1111;
let Inst{11-9} = 0b001;
let Inst{8} = banked{4};
let Inst{7-4} = 0b0000;
let Inst{3-0} = Rn;
}
// Dynamic stack allocation yields a _chkstk for Windows targets. These calls
// are needed to probe the stack when allocating more than
// 4k bytes in one go. Touching the stack at 4K increments is necessary to
// ensure that the guard pages used by the OS virtual memory manager are
// allocated in correct sequence.
// The main point of having a separate instruction is the extra unmodelled
// effects (compared to ordinary calls), such as the stack pointer change.
def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in
def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>;
def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK,
[SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
let usesCustomInserter = 1, Defs = [CPSR] in
def WIN__DBZCHK : PseudoInst<(outs), (ins tGPR:$divisor), NoItinerary,
[(win__dbzchk tGPR:$divisor)]>;
//===----------------------------------------------------------------------===//
// TLS Instructions
//
// __aeabi_read_tp preserves the registers r1-r3.
// This is a pseudo inst so that we can get the encoding right,
// complete with fixup for the aeabi_read_tp function.
// TPsoft is valid for ARM mode only; for Thumb mode, a tTPsoft pattern
// is defined in "ARMInstrThumb.td".
let isCall = 1,
Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
def TPsoft : ARMPseudoInst<(outs), (ins), 4, IIC_Br,
[(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
}
//===----------------------------------------------------------------------===//
// SJLJ Exception handling intrinsics
// eh_sjlj_setjmp() is an instruction sequence to store the return
// address and save #0 in R0 for the non-longjmp case.
// Since by its nature we may be coming from some other function to get
// here, and we're using the stack frame for the containing function to
// save/restore registers, we can't keep anything live in regs across
// the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
// when we get here from a longjmp(). We force everything out of registers
// except for our own input by listing the relevant registers in Defs. By
// doing so, we also cause the prologue/epilogue code to actively preserve
// all of the callee-saved registers, which is exactly what we want.
// A constant value is passed in $val, and we use the location as a scratch.
//
// These are pseudo-instructions and are lowered to individual MC-insts, so
// no encoding information is necessary.
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
NoItinerary,
[(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
Requires<[IsARM, HasVFP2]>;
}
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
NoItinerary,
[(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
Requires<[IsARM, NoVFP]>;
}
// FIXME: Non-IOS version(s)
let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
Defs = [ R7, LR, SP ] in {
def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
NoItinerary,
[(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
Requires<[IsARM]>;
}
let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary,
[(ARMeh_sjlj_setup_dispatch)]>;
// eh.sjlj.dispatchsetup pseudo-instruction.
// This pseudo is used for both ARM and Thumb. Any differences are handled when
// the pseudo is expanded (which happens before any passes that need the
// instruction size).
let isBarrier = 1 in
def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
// ARMv4 indirect branch using (MOVr PC, dst)
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
4, IIC_Br, [(brind GPR:$dst)],
(MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
// Large immediate handling.
// 32-bit immediate using two mod_imm pieces or movw + movt.
// This is a single pseudo instruction; the benefit is that it can be remat'd
// as a single unit instead of having to handle reg inputs.
// FIXME: Remove this when we can do generalized remat.
let isReMaterializable = 1, isMoveImm = 1 in
def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
[(set GPR:$dst, (arm_i32imm:$src))]>,
Requires<[IsARM]>;
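// Illustrative expansion: on a target with movt, the constant 0x12345678
// becomes "movw r0, #0x5678" followed by "movt r0, #0x1234"; otherwise it is
// built from two mod_imm pieces as described above.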
def LDRLIT_ga_abs : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iLoad_i,
[(set GPR:$dst, (ARMWrapper tglobaladdr:$src))]>,
Requires<[IsARM, DontUseMovt]>;
// Pseudo instruction that combines movw + movt + add pc (if PIC).
// It also makes it possible to rematerialize the instructions.
// FIXME: Remove this when we can do generalized remat and when machine licm
// can properly hoist the instructions.
let isReMaterializable = 1 in {
def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
IIC_iMOVix2addpc,
[(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
Requires<[IsARM, UseMovt]>;
def LDRLIT_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
IIC_iLoadiALU,
[(set GPR:$dst,
(ARMWrapperPIC tglobaladdr:$addr))]>,
Requires<[IsARM, DontUseMovt]>;
let AddedComplexity = 10 in
def LDRLIT_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
NoItinerary,
[(set GPR:$dst,
(load (ARMWrapperPIC tglobaladdr:$addr)))]>,
Requires<[IsARM, DontUseMovt]>;
let AddedComplexity = 10 in
def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
IIC_iMOVix2ld,
[(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
Requires<[IsARM, UseMovt]>;
} // isReMaterializable
// The many different faces of TLS access.
def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst),
(MOVi32imm tglobaltlsaddr :$dst)>,
Requires<[IsARM, UseMovt]>;
def : Pat<(ARMWrapper tglobaltlsaddr:$src),
(LDRLIT_ga_abs tglobaltlsaddr:$src)>,
Requires<[IsARM, DontUseMovt]>;
def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
(MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>;
def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
(LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
Requires<[IsARM, DontUseMovt]>;
let AddedComplexity = 10 in
def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)),
(MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>,
Requires<[IsARM, UseMovt]>;
// ConstantPool, GlobalAddress, and JumpTable
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
Requires<[IsARM, UseMovt]>;
def : ARMPat<(ARMWrapper texternalsym :$dst), (MOVi32imm texternalsym :$dst)>,
Requires<[IsARM, UseMovt]>;
def : ARMPat<(ARMWrapperJT tjumptable:$dst),
(LEApcrelJT tjumptable:$dst)>;
// TODO: add,sub,and, 3-instr forms?
// Tail calls. These patterns also apply to Thumb mode.
def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
def : ARMPat<(ARMcall_nolink texternalsym:$func),
(BMOVPCB_CALL texternalsym:$func)>;
// zextload i1 -> zextload i8
def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
def : ARMPat<(zextloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>;
// extload -> zextload
def : ARMPat<(extloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
def : ARMPat<(extloadi1 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>;
def : ARMPat<(extloadi8 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
def : ARMPat<(extloadi8 ldst_so_reg:$addr), (LDRBrs ldst_so_reg:$addr)>;
def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>;
def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
(SMULBB GPR:$a, GPR:$b)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
(SMULBT GPR:$a, GPR:$b)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
(SMULTB GPR:$a, GPR:$b)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>,
Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(int_arm_smulbb GPR:$a, GPR:$b),
(SMULBB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smulbt GPR:$a, GPR:$b),
(SMULBT GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smultb GPR:$a, GPR:$b),
(SMULTB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smultt GPR:$a, GPR:$b),
(SMULTT GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smulwb GPR:$a, GPR:$b),
(SMULWB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smulwt GPR:$a, GPR:$b),
(SMULWT GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(int_arm_smlabb GPR:$a, GPR:$b, GPR:$acc),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smlabt GPR:$a, GPR:$b, GPR:$acc),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smlatb GPR:$a, GPR:$b, GPR:$acc),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smlatt GPR:$a, GPR:$b, GPR:$acc),
(SMLATT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smlawb GPR:$a, GPR:$b, GPR:$acc),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(int_arm_smlawt GPR:$a, GPR:$b, GPR:$acc),
(SMLAWT GPR:$a, GPR:$b, GPR:$acc)>;
// Pre-v7 uses MCR for synchronization barriers.
def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
Requires<[IsARM, HasV6]>;
// SXT/UXT with no rotate
let AddedComplexity = 16 in {
def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
(UXTAB GPR:$Rn, GPR:$Rm, 0)>;
def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
(UXTAH GPR:$Rn, GPR:$Rm, 0)>;
}
def : ARMV6Pat<(sext_inreg GPR:$Src, i8), (SXTB GPR:$Src, 0)>;
def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
(SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
(SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
// Atomic load/store patterns
def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
(LDRBrs ldst_so_reg:$src)>;
def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
(LDRBi12 addrmode_imm12:$src)>;
def : ARMPat<(atomic_load_16 addrmode3:$src),
(LDRH addrmode3:$src)>;
def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
(LDRrs ldst_so_reg:$src)>;
def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
(LDRi12 addrmode_imm12:$src)>;
def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
(STRBrs GPR:$val, ldst_so_reg:$ptr)>;
def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
(STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
(STRH GPR:$val, addrmode3:$ptr)>;
def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
(STRrs GPR:$val, ldst_so_reg:$ptr)>;
def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
(STRi12 GPR:$val, addrmode_imm12:$ptr)>;
//===----------------------------------------------------------------------===//
// Thumb Support
//
include "ARMInstrThumb.td"
//===----------------------------------------------------------------------===//
// Thumb2 Support
//
include "ARMInstrThumb2.td"
//===----------------------------------------------------------------------===//
// Floating Point Support
//
include "ARMInstrVFP.td"
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON) Support
//
include "ARMInstrNEON.td"
//===----------------------------------------------------------------------===//
// Assembler aliases
//
// Memory barriers
def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>;
def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>;
def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>;
// System instructions
def : MnemonicAlias<"swi", "svc">;
// Load / Store Multiple
def : MnemonicAlias<"ldmfd", "ldm">;
def : MnemonicAlias<"ldmia", "ldm">;
def : MnemonicAlias<"ldmea", "ldmdb">;
def : MnemonicAlias<"stmfd", "stmdb">;
def : MnemonicAlias<"stmia", "stm">;
def : MnemonicAlias<"stmea", "stm">;
// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT with the
// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
(PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
(PKHBT GPRnopc:$Rd, GPRnopc:$Rm, GPRnopc:$Rn, 0, pred:$p), 0>,
Requires<[IsARM, HasV6]>;
// PUSH/POP aliases for STM/LDM
def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
// SSAT/USAT optional shift operand.
def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
(SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
(USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
// Extend instruction optional rotate operand.
def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
(SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
(SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
(SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
(SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
(SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"sxth${p} $Rd, $Rm",
(SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
(UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
(UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
(UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
(UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
(UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
def : ARMInstAlias<"uxth${p} $Rd, $Rm",
(UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
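// For example, "uxtb r0, r1" is accepted as shorthand for
// "uxtb r0, r1, ror #0", and similarly for the other extend aliases above.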
// RFE aliases
def : MnemonicAlias<"rfefa", "rfeda">;
def : MnemonicAlias<"rfeea", "rfedb">;
def : MnemonicAlias<"rfefd", "rfeia">;
def : MnemonicAlias<"rfeed", "rfeib">;
def : MnemonicAlias<"rfe", "rfeia">;
// SRS aliases
def : MnemonicAlias<"srsfa", "srsib">;
def : MnemonicAlias<"srsea", "srsia">;
def : MnemonicAlias<"srsfd", "srsdb">;
def : MnemonicAlias<"srsed", "srsda">;
def : MnemonicAlias<"srs", "srsia">;
// QSAX == QSUBADDX
def : MnemonicAlias<"qsubaddx", "qsax">;
// SASX == SADDSUBX
def : MnemonicAlias<"saddsubx", "sasx">;
// SHASX == SHADDSUBX
def : MnemonicAlias<"shaddsubx", "shasx">;
// SHSAX == SHSUBADDX
def : MnemonicAlias<"shsubaddx", "shsax">;
// SSAX == SSUBADDX
def : MnemonicAlias<"ssubaddx", "ssax">;
// UASX == UADDSUBX
def : MnemonicAlias<"uaddsubx", "uasx">;
// UHASX == UHADDSUBX
def : MnemonicAlias<"uhaddsubx", "uhasx">;
// UHSAX == UHSUBADDX
def : MnemonicAlias<"uhsubaddx", "uhsax">;
// UQASX == UQADDSUBX
def : MnemonicAlias<"uqaddsubx", "uqasx">;
// UQSAX == UQSUBADDX
def : MnemonicAlias<"uqsubaddx", "uqsax">;
// USAX == USUBADDX
def : MnemonicAlias<"usubaddx", "usax">;
// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
def : ARMInstSubst<"mov${s}${p} $Rd, $imm",
(MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"mvn${s}${p} $Rd, $imm",
(MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
// Same for AND <--> BIC
def : ARMInstSubst<"bic${s}${p} $Rd, $Rn, $imm",
(ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstSubst<"bic${s}${p} $Rdn, $imm",
(ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstSubst<"and${s}${p} $Rd, $Rn, $imm",
(BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : ARMInstSubst<"and${s}${p} $Rdn, $imm",
(BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, mod_imm_neg" -> sub
def : ARMInstSubst<"add${s}${p} $Rd, $Rn, $imm",
(SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"add${s}${p} $Rd, $imm",
(SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
// Likewise, "sub Rd, mod_imm_neg" -> add
def : ARMInstSubst<"sub${s}${p} $Rd, $Rn, $imm",
(ADDri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"sub${s}${p} $Rd, $imm",
(ADDri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
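// For example, "add r0, r1, #-8" is encoded as "sub r0, r1, #8" and
// "sub r0, r1, #-8" as "add r0, r1, #8".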
def : ARMInstSubst<"adc${s}${p} $Rd, $Rn, $imm",
(SBCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"adc${s}${p} $Rdn, $imm",
(SBCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"sbc${s}${p} $Rd, $Rn, $imm",
(ADCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
def : ARMInstSubst<"sbc${s}${p} $Rdn, $imm",
(ADCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via mod_imm_neg
def : ARMInstSubst<"cmp${p} $Rd, $imm",
(CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
def : ARMInstSubst<"cmn${p} $Rd, $imm",
(CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
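// For example, "cmp r0, #-1" is encoded as "cmn r0, #1" and
// "cmn r0, #-4" as "cmp r0, #4".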
// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
// LSR, ROR, and RRX instructions.
// FIXME: We need C++ parser hooks to map the alias to the MOV
// encoding. It seems we should be able to do that sort of thing
// in tblgen, but it could get ugly.
let TwoOperandAliasConstraint = "$Rm = $Rd" in {
def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
cc_out:$s)>;
def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
cc_out:$s)>;
def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
cc_out:$s)>;
def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
(ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
cc_out:$s)>;
}
def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
(ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
let TwoOperandAliasConstraint = "$Rn = $Rd" in {
def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
(ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
cc_out:$s)>;
}
// "neg" is and alias for "rsb rd, rn, #0"
def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
(RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
Requires<[IsARM, NoV6]>;
// MUL/UMLAL/SMLAL/UMULL/SMULL are available on all arches, but
// the instruction definitions need different constraints pre-v6.
// Use these aliases for assembly parsing on pre-v6.
def : InstAlias<"mul${s}${p} $Rd, $Rn, $Rm",
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"mla${s}${p} $Rd, $Rn, $Rm, $Ra",
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra,
pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
(SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umlal${s}${p} $RdLo, $RdHi, $Rn, $Rm",
(UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
(UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), 0>,
Requires<[IsARM, NoV6]>;
// 'it' blocks in ARM mode just validate the predicates. The IT itself
// is discarded.
def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
ComplexDeprecationPredicate<"IT">;
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
NoItinerary,
[(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;
//===----------------------------------
// Atomic cmpxchg for -O0
//===----------------------------------
// The fast register allocator used during -O0 inserts spills to cover any VRegs
// live across basic block boundaries. When this happens between an LDREX and an
// STREX it can clear the exclusive monitor, causing all cmpxchg attempts to
// fail.
// Unfortunately, this means we need an alternative (expanded post-regalloc)
// path for -O0 compilations. Fortunately this path can be significantly more
// naive than the standard expansion: we conservatively assume seq_cst, strong
// cmpxchg and omit clrex on failure.
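// For example, at -O0 a seq_cst i32 cmpxchg is selected to the CMP_SWAP_32
// pseudo below and only expanded into its LDREX/STREX loop after register
// allocation, so no spill code can be inserted inside the loop.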
-let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in {
-def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
(ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
NoItinerary, []>, Sched<[]>;
}
def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
[(atomic_fence imm:$ordering, 0)]> {
let hasSideEffects = 1;
let Size = 0;
let AsmString = "@ COMPILER BARRIER";
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7563bffd8f87..1e73122cdc38 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,36712 +1,36742 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
"x86-experimental-vector-widening-legalization", cl::init(false),
cl::desc("Enable an experimental vector type legalization through widening "
"rather than promotion."),
cl::Hidden);
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
cl::desc("Sets the preferable loop alignment for experiments "
"(the last x86-experimental-pref-loop-alignment bits"
" of the loop header PC will be 0)."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
"SHIFT, LEA, etc."),
cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the TargetLowering object.
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit, since we have so many registers, use the ILP scheduler.
// For 32-bit, use the register pressure specific scheduling.
// For Atom, always use ILP scheduling.
if (Subtarget.isAtom())
setSchedulingPreference(Sched::ILP);
else if (Subtarget.is64Bit())
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides and use cheaper ones.
if (TM.getOptLevel() >= CodeGenOpt::Default) {
if (Subtarget.hasSlowDivide32())
addBypassSlowDiv(32, 8);
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
addBypassSlowDiv(64, 32);
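// For example, with addBypassSlowDiv(64, 32) a 64-bit division whose operands
// happen to fit in 32 bits is dispatched to the much cheaper 32-bit divide at
// run time, falling back to the full 64-bit divide otherwise.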
}
if (Subtarget.isTargetKnownWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
setLibcallName(RTLIB::SREM_I64, "_allrem");
setLibcallName(RTLIB::UREM_I64, "_aullrem");
setLibcallName(RTLIB::MUL_I64, "_allmul");
setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
if (Subtarget.isTargetDarwin()) {
// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
setUseUnderscoreSetJmp(false);
setUseUnderscoreLongJmp(false);
} else if (Subtarget.isTargetWindowsGNU()) {
// MS runtime is weird: it exports _setjmp, but plain (no-underscore) longjmp!
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(false);
} else {
setUseUnderscoreSetJmp(true);
setUseUnderscoreLongJmp(true);
}
// Set up the register classes.
addRegisterClass(MVT::i8, &X86::GR8RegClass);
addRegisterClass(MVT::i16, &X86::GR16RegClass);
addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget.is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
// f32/f64 are legal, f80 is custom.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
else
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
} else if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
}
// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// SSE has no i16 to fp conversion, only i32.
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
}
} else {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
}
// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
if (!Subtarget.useSoftFloat()) {
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
if (X86ScalarSSEf32) {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
// f32 and f64 cases are Legal, f80 case is not
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
}
} else {
setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
}
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
if (Subtarget.is64Bit()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
} else {
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
}
} else if (!Subtarget.useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
else
// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
// With SSE3 we can use fisttpll to convert to a signed i64; without
// SSE, we're stuck with a fistpll.
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
}
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
// Without SSE, i64->f64 goes through memory.
setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
}
} else if (!Subtarget.is64Bit())
setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
// the two-result form to trivial CSE, which is able to combine x/y and x%y
// into a single instruction.
//
// Scalar integer multiply-high is also lowered to use two-result
// operations, to match the available instructions. However, plain multiply
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
}
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
if (!Subtarget.hasBMI()) {
setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
if (Subtarget.is64Bit()) {
setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
}
}
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
if (Subtarget.useSoftFloat() ||
(!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
// These should be promoted to a larger select which is supported.
setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
+
+ // Custom action for SELECT MMX and expand action for SELECT_CC MMX
+ setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here are NOT intended to support
// SjLj exception handling, but rather a light-weight setjmp/longjmp replacement
// used for continuations, user-level threading, etc. As a result, no other
// SjLj exception interfaces are implemented; please don't build your own
// exception handling based on them.
// LLVM/Clang supports zero-cost DWARF exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
// Darwin ABI issue.
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::ConstantPool , VT, Custom);
setOperationAction(ISD::JumpTable , VT, Custom);
setOperationAction(ISD::GlobalAddress , VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
setOperationAction(ISD::SRA_PARTS, VT, Custom);
setOperationAction(ISD::SRL_PARTS, VT, Custom);
}
if (Subtarget.hasSSE1())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (Subtarget.hasCmpxchg16b()) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
}
// FIXME - use subtarget debug flags
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
}
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
setOperationAction(ISD::VAEND , MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
: &X86::FR64RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG, VT, Custom);
// Use ANDPD and ORPD to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
// Expand FP immediates into loads from the stack, except for the special
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
} else if (UseX87 && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
// Use XORP to simulate FNEG.
setOperationAction(ISD::FNEG , MVT::f32, Custom);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
// Use ANDPS and ORPS to simulate FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
setOperationAction(ISD::FSIN , MVT::f32, Expand);
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
// Special cases we handle for FP constants.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , VT, Expand);
setOperationAction(ISD::FCOS , VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
addLegalFPImmediate(APFloat(+0.0f)); // FLD0
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
}
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87, except f128 in MMX.
if (UseX87) {
if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
addRegisterClass(MVT::f128, &X86::FR128RegClass);
ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
setOperationAction(ISD::FABS , MVT::f128, Custom);
setOperationAction(ISD::FNEG , MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
}
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
&ignored);
addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
if (!TM.Options.UnsafeFPMath) {
setOperationAction(ISD::FSIN , MVT::f80, Expand);
setOperationAction(ISD::FCOS , MVT::f80, Expand);
setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
}
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
}
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
}
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FNEARBYINT, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::SETCC, VT, Expand);
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
// types, we have to deal with them whether we ask for Expansion or not.
// Setting Expand causes its own optimisation problems though, so leave
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
// split/scalarized right now.
if (VT.getVectorElementType() == MVT::f16)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
// registers cannot be used even for integer operations.
addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
}
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// We support custom legalizing of sext and anyext loads for specific
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
}
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
}
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
setOperationAction(ISD::ABS, MVT::v16i8, Legal);
setOperationAction(ISD::ABS, MVT::v8i16, Legal);
setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
}
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::ROTL, VT, Custom);
// XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
bool HasInt256 = Subtarget.hasInt256();
addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
}
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::FMA, VT, Legal);
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
}
if (HasInt256) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
if (HasInt256)
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
setTruncStoreAction(VT, MaskVT, Custom);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
if (Subtarget.hasVLX()){
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
} else {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
setOperationAction(ISD::SINT_TO_FP, VT, Legal);
setOperationAction(ISD::UINT_TO_FP, VT, Legal);
setOperationAction(ISD::FP_TO_SINT, VT, Legal);
setOperationAction(ISD::FP_TO_UINT, VT, Legal);
}
if (Subtarget.hasVLX()) {
// Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
}
}
if (Subtarget.hasVLX()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
// FIXME: These commands are available on SSE/AVX2; add relevant patterns.
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
}
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
}
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
setOperationAction(ISD::ABS, MVT::v4i64, Legal);
setOperationAction(ISD::ABS, MVT::v2i64, Legal);
for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
}
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
MVT::v8i64}) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// Need to promote to 64-bit even though we have 32-bit masked instructions
// because the IR optimizers rearrange bitcasts around logic ops leaving
// too many variations to handle if we don't promote them.
setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
MVT::v4i64, MVT::v8i64}) {
setOperationAction(ISD::CTLZ, VT, Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}
} // Subtarget.hasCDI()
if (Subtarget.hasDQI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
setOperationAction(ISD::MUL, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
if (Subtarget.hasVPOPCNTDQ()) {
// VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
// version of popcntd/q.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
MVT::v4i32, MVT::v2i64})
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Custom under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 })
+ MVT::v8f32, MVT::v4f64, MVT::v1i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
MVT::v16i1, MVT::v32i1, MVT::v64i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Legal);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
}// has AVX-512
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
setOperationAction(ISD::ADD, MVT::v32i1, Custom);
setOperationAction(ISD::ADD, MVT::v64i1, Custom);
setOperationAction(ISD::SUB, MVT::v32i1, Custom);
setOperationAction(ISD::SUB, MVT::v64i1, Custom);
setOperationAction(ISD::MUL, MVT::v32i1, Custom);
setOperationAction(ISD::MUL, MVT::v64i1, Custom);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
if (Subtarget.hasVLX()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Action);
setOperationAction(ISD::MSTORE, VT, Action);
}
if (Subtarget.hasCDI()) {
setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
}
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
if (Subtarget.hasVLX()) {
// FIXME: These commands are available on SSE/AVX2; add relevant patterns.
setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
}
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
}
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
// Support carry in as value rather than glue.
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
}
// Combine sin / cos into one node or libcall if possible.
if (Subtarget.hasSinCos()) {
setLibcallName(RTLIB::SINCOS_F32, "sincosf");
setLibcallName(RTLIB::SINCOS_F64, "sincos");
if (Subtarget.isTargetDarwin()) {
// For MacOSX, we don't want the normal expansion of a libcall to sincos.
// We want to issue a libcall to __sincos_stret to avoid memory traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
}
}
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
setOperationAction(ISD::UDIV, MVT::i128, Custom);
setOperationAction(ISD::SREM, MVT::i128, Custom);
setOperationAction(ISD::UREM, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}
// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
ISD::FLOG10, ISD::FPOW, ISD::FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
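// Illustrative only: the MSVC CRT's inline fmodf effectively does
//   float fmodf(float x, float y) { return (float)fmod((double)x, (double)y); }
// so promoting these f32 operations to f64 mirrors what the CRT headers
// already do.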
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());
MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
verifyIntrinsicTables();
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
if (ExperimentalVectorWideningLegalization &&
VT.getVectorNumElements() != 1 &&
VT.getVectorElementType().getSimpleVT() != MVT::i1)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
return MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
const unsigned NumElts = VVT.getVectorNumElements();
MVT EltVT = VVT.getVectorElementType();
if (VVT.is512BitVector()) {
if (Subtarget.hasAVX512())
if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
EltVT == MVT::f32 || EltVT == MVT::f64)
switch(NumElts) {
case 8: return MVT::v8i1;
case 16: return MVT::v16i1;
}
if (Subtarget.hasBWI())
if (EltVT == MVT::i8 || EltVT == MVT::i16)
switch(NumElts) {
case 32: return MVT::v32i1;
case 64: return MVT::v64i1;
}
}
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return MVT::getVectorVT(MVT::i1, NumElts);
if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
EVT LegalVT = getTypeToTransformTo(Context, VT);
EltVT = LegalVT.getVectorElementType().getSimpleVT();
}
if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
switch(NumElts) {
case 2: return MVT::v2i1;
case 4: return MVT::v4i1;
case 8: return MVT::v8i1;
}
}
return VT.changeVectorElementTypeToInteger();
}
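// Illustrative examples for getSetCCResultType above: with AVX512, a v16f32
// compare yields v16i1; with BWI, a v64i8 compare yields v64i1; with VLX and
// 32-bit or wider elements, a v4i32 compare yields v4i1. Otherwise the result
// falls back to VT.changeVectorElementTypeToInteger().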
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
unsigned EltAlign = 0;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
if (MaxAlign == 16)
break;
}
}
}
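// Illustrative example for getMaxByValAlign above: a struct type such as
// { i32, <4 x float> } yields MaxAlign = 16 because of its 128-bit vector
// element, while { i32, [8 x i8] } leaves MaxAlign unchanged.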
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = DL.getABITypeAlignment(Ty);
if (TyAlign > 8)
return TyAlign;
return 8;
}
unsigned Align = 4;
if (Subtarget.hasSSE1())
getMaxByValAlign(Ty, Align);
return Align;
}
/// Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement,
/// probably because the source does not need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Size >= 32 && Subtarget.hasAVX()) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
// choose an optimal type with a vector element larger than a byte,
// getMemsetStores() may create an intermediate splat (using an integer
// multiply) before we splat as a vector.
return MVT::v32i8;
}
if (Subtarget.hasSSE2())
return MVT::v16i8;
// TODO: Can SSE1 handle a byte vector?
if (Subtarget.hasSSE1())
return MVT::v4f32;
} else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
// The gymnastics of splatting a byte value into an XMM register and then
// only using 8-byte stores (because this is a CPU with slow unaligned
// 16-byte accesses) makes that a loser.
return MVT::f64;
}
}
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
if (Subtarget.is64Bit() && Size >= 8)
return MVT::i64;
return MVT::i32;
}
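// Illustrative summary of the selection above (no NoImplicitFloat attribute):
//   Size >= 32, AVX, 16-byte-aligned or fast unaligned accesses -> v32i8
//   Size >= 16, SSE2                                            -> v16i8
//   Size >= 16, SSE1 only                                       -> v4f32
//   Size >= 8, 32-bit SSE2 target, not a nonzero memset and not a
//   constant-source memcpy                                      -> f64
//   otherwise: i64 on 64-bit targets when Size >= 8, else i32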
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
else if (VT == MVT::f64)
return X86ScalarSSEf64;
return true;
}
bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
// 8-byte and under are always assumed to be fast.
*Fast = true;
break;
case 128:
*Fast = !Subtarget.isUnalignedMem16Slow();
break;
case 256:
*Fast = !Subtarget.isUnalignedMem32Slow();
break;
// TODO: What about AVX-512 (512-bit) accesses?
}
}
// Misaligned accesses of any size are always allowed.
return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
// In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol.
if (isPositionIndependent() && Subtarget.isPICStyleGOT())
return MachineJumpTableInfo::EK_Custom32;
// Otherwise, use the normal jump table encoding heuristics.
return TargetLowering::getJumpTableEncoding();
}
bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const {
// Only relabel X86-32 for C / Stdcall CCs.
if (Subtarget.is64Bit())
return;
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
return;
unsigned ParamRegs = 0;
if (auto *M = MF->getFunction()->getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N integer arguments as being passed in registers.
for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
Type *T = Args[Idx].Ty;
if (T->isPointerTy() || T->isIntegerTy())
if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
unsigned numRegs = 1;
if (MF->getDataLayout().getTypeAllocSize(T) > 4)
numRegs = 2;
if (ParamRegs < numRegs)
return;
ParamRegs -= numRegs;
Args[Idx].IsInReg = true;
}
}
}
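// Illustrative example for markLibCallAttributes above: if the module asks for
// three register parameters (e.g. built with -mregparm=3), a 32-bit C libcall
// taking (i32, i64) marks both arguments InReg, consuming one register for the
// i32 and two for the i64.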
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned uid,MCContext &Ctx) const{
assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
SelectionDAG &DAG) const {
if (!Subtarget.is64Bit())
// This doesn't have SDLoc associated with it, but is not really the
// same as a Register.
return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()));
return Table;
}
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
MCContext &Ctx) const {
// X86-64 uses RIP relative addressing based on the jump table label.
if (Subtarget.isPICStyleRIPRel())
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
case MVT::x86mmx:
RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
case MVT::v8f32: case MVT::v4f64:
case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
case MVT::v16f32: case MVT::v8f64:
RRC = &X86::VR128XRegClass;
break;
}
return std::make_pair(RRC, Cost);
}
unsigned X86TargetLowering::getAddressSpace() const {
if (Subtarget.is64Bit())
return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
return 256;
}
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
static Constant* SegmentOffset(IRBuilder<> &IRB,
unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
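// Illustrative example for SegmentOffset above: SegmentOffset(IRB, 0x28, 257)
// builds the constant expression
//   inttoptr (i32 40 to i8* addrspace(257)*)
// and address space 257 is lowered to an %fs-relative access (%fs:0x28) on
// x86-64.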
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// glibc, bionic, and Fuchsia have a special slot for the stack guard in
// tcbhead_t; use it instead of the usual global variable (see
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
// <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. On i386 it is %gs:0x14.
unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
}
return TargetLowering::getIRStackGuard(IRB);
}
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// The MSVC CRT provides functionality for stack protection.
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
// MSVC CRT has a global variable holding security cookie.
M.getOrInsertGlobal("__security_cookie",
Type::getInt8PtrTy(M.getContext()));
// MSVC CRT has a function to validate security cookie.
auto *SecurityCheckCookie = cast<Function>(
M.getOrInsertFunction("__security_check_cookie",
Type::getVoidTy(M.getContext()),
Type::getInt8PtrTy(M.getContext())));
SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
return;
}
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
// MSVC CRT has a global variable holding security cookie.
if (Subtarget.getTargetTriple().isOSMSVCRT())
return M.getGlobalVariable("__security_cookie");
return TargetLowering::getSDagStackGuard(M);
}
Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
// MSVC CRT has a function to validate security cookie.
if (Subtarget.getTargetTriple().isOSMSVCRT())
return M.getFunction("__security_check_cookie");
return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
if (Subtarget.isTargetAndroid()) {
// %fs:0x48, unless we're using a Kernel code model, in which case it's
// %gs:0x48; %gs:0x24 on i386.
unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
return SegmentOffset(IRB, Offset, getAddressSpace());
}
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
// <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
}
return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
return SrcAS < 256 && DestAS < 256;
}
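// Illustrative: an addrspacecast between ordinary address spaces (e.g. 0 and 1)
// is treated as a no-op here, but any cast involving the segment address
// spaces (GS=256, FS=257, SS=258) is not.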
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "X86GenCallingConv.inc"
bool X86TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values.
/// \returns the DAG node after lowering to the register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
const SDLoc &Dl, SelectionDAG &DAG) {
EVT ValVT = ValArg.getValueType();
if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
// Two stage lowering might be required
// bitcast: v8i1 -> i8 / v16i1 -> i16
// anyextend: i8 -> i32 / i16 -> i32
EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
if (ValLoc == MVT::i32)
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
return ValToCopy;
} else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
// One stage lowering is required
// bitcast: v32i1 -> i32 / v64i1 -> i64
return DAG.getBitcast(ValLoc, ValArg);
} else
return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}
/// Breaks a v64i1 value into two registers and adds the new nodes to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
"Expected AVX512BW or AVX512BMI target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The value should reside in two registers");
// Before splitting the value we cast it to i64
Arg = DAG.getBitcast(MVT::i64, Arg);
// Splitting the value into two i32 types
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(0, Dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
DAG.getConstant(1, Dl, MVT::i32));
// Attach the two i32 values to the corresponding registers
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
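// Illustrative example for Passv64i1ArgInRegs above: on a 32-bit AVX512BW
// target, a v64i1 mask is bitcast to i64, split into two i32 halves, and the
// Lo/Hi halves are attached to the two registers chosen by the calling
// convention (typically EAX and EDX for a return value).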
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
// In some cases we need to disable registers from the default CSR list.
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
SDValue Flag;
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i32));
// Copy the result values into the output registers.
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// Add the register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
// Promote values to the appropriate types.
if (VA.getLocInfo() == CCValAssign::SExt)
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::AExt) {
if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
else
ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
}
else if (VA.getLocInfo() == CCValAssign::BCvt)
ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
(Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
} else if (ValVT == MVT::f64 &&
(Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
// Likewise we can't return F64 values with SSE1 only. gcc does so, but
// llvm-gcc has never done it right and no one has noticed, so this
// should be OK for now.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
if (VA.getLocReg() == X86::FP0 ||
VA.getLocReg() == X86::FP1) {
// If this is a copy from an xmm register to ST(0), use an FPExtend to
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
RetOps.push_back(ValToCopy);
// Don't emit a copytoreg.
continue;
}
// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
// which is returned in RAX / RDX.
if (Subtarget.is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget.hasSSE2())
ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
// Add nodes to the DAG and add the values into the RetOps list
for (auto &Reg : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
}
}
// The Swift calling convention does not require that we copy the sret argument
// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
// All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
//
// Checking Function.hasStructRetAttr() here is insufficient because the IR
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
// For the case of sret and another return value, we have
// Chain_0 at the function entry
// Chain_1 = getCopyToReg(Chain_0) in the above loop
// If we use Chain_1 in getCopyFromReg, we will have
// Val = getCopyFromReg(Chain_1)
// Chain_2 = getCopyToReg(Chain_1, Val) from below
// getCopyToReg(Chain_0) will be glued together with
// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
// Data dependency from Unit B to Unit A due to usage of Val in
// getCopyToReg(Chain_1, Val)
// Chain dependency from Unit A to Unit B
// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
unsigned RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
// Add the returned register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (X86::GR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
X86ISD::NodeType opcode = X86ISD::RET_FLAG;
if (CallConv == CallingConv::X86_INTR)
opcode = X86ISD::IRET;
return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
// If we are returning more than one value, we can definitely
// not make a tail call; see PR19530.
if (UI->getNumOperands() > 4)
return false;
if (UI->getNumOperands() == 4 &&
UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT = MVT::i32;
bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
// The ABI does not require i1, i8 or i16 to be extended.
//
// On Darwin, there is code in the wild relying on Clang's old behaviour of
// always extending i8/i16 return values, so keep doing that for now.
// (PR26665).
ReturnMVT = MVT::i8;
}
EVT MinVT = getRegisterType(Context, ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
/// glue purposes. If the DAG already uses a
/// physical register instead of a virtual one, we should glue
/// our new SDValue to the InFlag SDValue.
/// \return a new SDValue of 64 bit width.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG,
const SDLoc &Dl, const X86Subtarget &Subtarget,
SDValue *InFlag = nullptr) {
assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(VA.getValVT() == MVT::v64i1 &&
"Expecting first location of 64 bit width type");
assert(NextVA.getValVT() == VA.getValVT() &&
"The locations should have the same type");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
"The values should reside in two registers");
SDValue Lo, Hi;
unsigned Reg;
SDValue ArgValueLo, ArgValueHi;
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = &X86::GR32RegClass;
// Read a 32 bit value from the registers
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register
Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
} else {
// When a physical register is available read the value from it and glue
// the reads together.
ArgValueLo =
DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueLo.getValue(2);
ArgValueHi =
DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
*InFlag = ArgValueHi.getValue(2);
}
// Convert the i32 type into v32i1 type
Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
// Convert the i32 type into v32i1 type
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
// Concatenate the two values together
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to the mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
const EVT &ValLoc, const SDLoc &Dl,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
if (ValVT == MVT::v1i1)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
if (ValVT == MVT::v64i1) {
// On a 32 bit machine, this case is handled by getv64i1Argument.
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
// On a 64 bit machine, there is no need to truncate the value; only bitcast it.
} else {
MVT maskLen;
switch (ValVT.getSimpleVT().SimpleTy) {
case MVT::v8i1:
maskLen = MVT::i8;
break;
case MVT::v16i1:
maskLen = MVT::i16;
break;
case MVT::v32i1:
maskLen = MVT::i32;
break;
default:
llvm_unreachable("Expecting a vector of i1 types");
}
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
return DAG.getBitcast(ValVT, ValReturned);
}
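// Illustrative mapping for lowerRegToMasks above: a v8i1 value located in i32
// is truncated to i8 and bitcast to v8i1; a v16i1 value in i32 is truncated to
// i16 and bitcast; a v64i1 value in i64 is bitcast directly (the 32-bit case
// is handled by getv64i1Argument instead).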
/// Lower the result values of a call into the
/// appropriate copies out of the corresponding physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
uint32_t *RegMask) const {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
++I, ++InsIndex) {
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
// In some calling conventions we need to remove the used registers
// from the register mask.
if (RegMask) {
for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
if (!Subtarget.hasX87())
report_fatal_error("X87 register return with X87 disabled");
CopyVT = MVT::f80;
RoundAfterCopy = (CopyVT != VA.getLocVT());
}
SDValue Val;
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
Val =
getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
} else {
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
.getValue(1);
Val = Chain.getValue(0);
InFlag = Chain.getValue(2);
}
if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
if (VA.getValVT().isVector() &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
} else
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
InVals.push_back(Val);
}
return Chain;
}
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// The StdCall calling convention is standard for many Windows API
// routines. It differs from the C calling convention only slightly:
// the callee, not the caller, cleans up the stack, and symbols are
// decorated. It doesn't support any vector arguments.
// For info on fast calling convention see Fast Calling Convention (tail call)
// implementation LowerX86_32FastCCCallTo.
/// Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
NotStructReturn,
RegStructReturn,
StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
if (Outs.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
if (Ins.empty())
return NotStructReturn;
const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
if (!Flags.isSRet())
return NotStructReturn;
if (Flags.isInReg() || IsMCU)
return RegStructReturn;
return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
/*isVolatile*/false, /*AlwaysInline=*/true,
/*isTailCall*/false,
MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
CC == CallingConv::HHVM);
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_VectorCall:
case CallingConv::X86_FastCall:
return true;
default:
return canGuaranteeTCO(CC);
}
}
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo &MFI, unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If the value is passed by pointer, the address is passed instead of the
// value itself. There is no need to extend if the mask value and the location
// share the same absolute size.
bool ExtendedInMem =
VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
// Calculate SP offset of interrupt parameter, re-arrange the slot normally
// taken by a return address.
int Offset = 0;
if (CallConv == CallingConv::X86_INTR) {
// X86 interrupts may take one or two arguments.
// Unlike a regular call, there is no return address on the stack.
// The offset of the last argument needs to be set to -4/-8 bytes,
// while the offset of the first argument (when there are two) is 0 bytes.
Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
if (Subtarget.is64Bit() && Ins.size() == 2) {
// The stack pointer needs to be realigned for 64 bit handlers with error
// code, so the argument offset changes by 8 bytes.
Offset += 8;
}
}
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization, mark all arguments mutable, since they
// could be overwritten by the lowering of arguments in a tail call.
if (Flags.isByVal()) {
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
return DAG.getFrameIndex(FI, PtrVT);
}
// This is an argument in memory. We might be able to perform copy elision.
if (Flags.isCopyElisionCandidate()) {
EVT ArgVT = Ins[i].ArgVT;
SDValue PartAddr;
if (Ins[i].PartOffset == 0) {
// If this is a one-part value or the first part of a multi-part value,
// create a stack object for the entire argument value type and return a
// load from our portion of it. This assumes that if the first part of an
// argument is in memory, the rest will also be in memory.
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
/*Immutable=*/false);
PartAddr = DAG.getFrameIndex(FI, PtrVT);
return DAG.getLoad(
ValVT, dl, Chain, PartAddr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
} else {
// This is not the first piece of an argument in memory. See if there is
// already a fixed stack object including this offset. If so, assume it
// was created by the PartOffset == 0 branch above and create a load from
// the appropriate offset into it.
int64_t PartBegin = VA.getLocMemOffset();
int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
int FI = MFI.getObjectIndexBegin();
for (; MFI.isFixedObjectIndex(FI); ++FI) {
int64_t ObjBegin = MFI.getObjectOffset(FI);
int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
break;
}
if (MFI.isFixedObjectIndex(FI)) {
SDValue Addr =
DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
return DAG.getLoad(
ValVT, dl, Chain, Addr,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
Ins[i].PartOffset));
}
}
}
int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
VA.getLocMemOffset(), isImmutable);
// Set SExt or ZExt flag.
if (VA.getLocInfo() == CCValAssign::ZExt) {
MFI.setObjectZExt(FI, true);
} else if (VA.getLocInfo() == CCValAssign::SExt) {
MFI.setObjectSExt(FI, true);
}
// Adjust SP offset of interrupt parameter.
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
return ExtendedInMem
? (VA.getValVT().isVector()
? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
: Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
static const MCPhysReg GPR64ArgRegsWin64[] = {
X86::RCX, X86::RDX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
}
static const MCPhysReg GPR64ArgRegs64Bit[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
CallingConv::ID CallConv,
const X86Subtarget &Subtarget) {
assert(Subtarget.is64Bit());
if (Subtarget.isCallingConvWin64(CallConv)) {
// The XMM registers which might contain vararg parameters are shadowed
// by their paired GPRs, so we only need to save the GPRs to their home
// slots.
// TODO: __vectorcall will change this.
return None;
}
const Function *Fn = MF.getFunction();
bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
static const MCPhysReg XMMArgRegs64Bit[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function *Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
Subtarget.isTargetCygMing() &&
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
if (CallConv == CallingConv::X86_INTR) {
bool isLegal = Ins.size() == 1 ||
(Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
(!Is64Bit && Ins[1].VT == MVT::i32)));
if (!isLegal)
report_fatal_error("X86 interrupts may take one or two arguments");
}
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Ins, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
}
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++InsIndex) {
assert(InsIndex < Ins.size() && "Invalid Ins index");
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
if (VA.needsCustom()) {
assert(
VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// In the regcall calling convention, v64i1 values compiled for a
// 32-bit architecture are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
else if (RegVT == MVT::f80)
RC = &X86::RFP80RegClass;
else if (RegVT == MVT::f128)
RC = &X86::FR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
else if (RegVT.is128BitVector())
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
else if (RegVT == MVT::v16i1)
RC = &X86::VK16RegClass;
else if (RegVT == MVT::v32i1)
RC = &X86::VK32RegClass;
else if (RegVT == MVT::v64i1)
RC = &X86::VK64RegClass;
else
llvm_unreachable("Unknown argument type!");
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else if (VA.getValVT().isVector() &&
VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
} else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
assert(VA.isMemLoc());
ArgValue =
LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
// If the value is passed via a pointer, do a load.
if (VA.getLocInfo() == CCValAssign::Indirect)
ArgValue =
DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
InVals.push_back(ArgValue);
}
for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// The Swift calling convention does not require us to copy the sret argument
// into %rax/%eax for the return, so we don't set SRetReturnReg for Swift.
if (CallConv == CallingConv::Swift)
continue;
// All x86 ABIs require that for returning structs by value we copy the
// sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
break;
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
if (shouldGuaranteeTCO(CallConv,
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index
// for the start of the first vararg value, for expansion of llvm.va_start. We
// can skip this if there are no va_start calls.
if (MFI.hasVAStart() &&
(Is64Bit || (CallConv != CallingConv::X86_FastCall &&
CallConv != CallingConv::X86_ThisCall))) {
FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
}
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
if (Is64Bit && isVarArg && MFI.hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
// Gather all the live in physical registers.
SmallVector<SDValue, 6> LiveGPRs;
SmallVector<SDValue, 8> LiveXMMRegs;
SDValue ALVal;
for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
}
if (!ArgXMMs.empty()) {
unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
LiveXMMRegs.push_back(
DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
}
}
if (IsWin64) {
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
FuncInfo->setRegSaveFrameIndex(
MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
// Fixup to set vararg frame on shadow area (4 x i64).
if (NumIntRegs < 4)
FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so
// they may be loaded by dereferencing the result of va_next.
FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
// Store the integer parameter registers.
SmallVector<SDValue, 8> MemOps;
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
getPointerTy(DAG.getDataLayout()));
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(),
FuncInfo->getRegSaveFrameIndex(), Offset));
MemOps.push_back(Store);
Offset += 8;
}
if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
// Now store the XMM (fp + vector) parameter registers.
SmallVector<SDValue, 12> SaveXMMOps;
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getRegSaveFrameIndex(), dl));
SaveXMMOps.push_back(DAG.getIntPtrConstant(
FuncInfo->getVarArgsFPOffset(), dl));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
MVT::Other, SaveXMMOps));
}
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
}
if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
if (Subtarget.hasAVX512() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
VecVT = MVT::v4f32;
// We forward some GPRs and some vector types.
SmallVector<MVT, 2> RegParmTypes;
MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
RegParmTypes.push_back(IntVT);
if (VecVT != MVT::Other)
RegParmTypes.push_back(VecVT);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
// Conservatively forward AL on x86_64, since it might be used for varargs.
if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
// Copy all forwards from physical to virtual registers.
for (ForwardedRegister &F : Forwards) {
// FIXME: Can we use a less constrained schedule?
SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
}
}
// Some CCs need callee pop.
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
if (CallConv == CallingConv::X86_FastCall ||
CallConv == CallingConv::X86_ThisCall)
// fastcall and thiscall functions can't have varargs.
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
// that we'd prefer this slot be allocated towards the bottom of the frame
// (i.e. near the stack pointer after allocating the frame). Every
// funclet needs a copy of this slot in its (mostly empty) frame, and the
// offset from the bottom of this and each funclet's frame must be the
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
if (CallConv == CallingConv::X86_RegCall ||
Fn->hasFnAttribute("no_caller_saved_registers")) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
}
return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
Chain, dl, Arg, PtrOff,
MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
bool Is64Bit, int FPDiff, const SDLoc &dl) const {
// Adjust the Return address stack slot.
EVT VT = getPointerTy(DAG.getDataLayout());
OutRetAddr = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
return SDValue(OutRetAddr.getNode(), 1);
}
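// Clarifying note (added, not in the original source): OutRetAddr now holds
// the loaded "old" return address value, while the SDValue returned above is
// the load's chain result (value number 1), which callers thread on as the
// new chain.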
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
SDValue Chain, SDValue RetAddrFrIdx,
EVT PtrVT, unsigned SlotSize,
int FPDiff, const SDLoc &dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
int NewReturnAddrFI =
MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
false);
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
/// Returns a vector_shuffle mask for a movs{s|d}, movd
/// operation of the specified width.
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
Mask.push_back(i);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
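// Worked example (added for illustration, not in the original source): for
// VT = v4f32 the mask built above is {4, 1, 2, 3}, i.e. element 0 is taken
// from V2 and the remaining lanes from V1 -- the MOVSS-style blend of the
// low element.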
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &dl = CLI.DL;
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
CallingConv::ID CallConv = CLI.CallConv;
bool &isTailCall = CLI.IsTailCall;
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget.is64Bit();
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
const CallInst *CI =
CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
if (Attr.getValueAsString() == "true")
isTailCall = false;
if (Subtarget.isPICStyleGOT() &&
!MF.getTarget().Options.GuaranteedTailCallOpt) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (!G || (!G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility()))
isTailCall = false;
}
bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
} else if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
IsSibcall = true;
if (isTailCall)
++NumTailCalls;
}
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeArguments(Outs, CC_X86);
// In vectorcall calling convention a second pass is required for the HVA
// types.
if (CallingConv::X86_VectorCall == CallConv) {
CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in the caller's
// own caller's stack.
NumBytes = 0;
else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the return address stack slot,
// but only if the delta is greater than the previous delta.
if (FPDiff < X86Info->getTCReturnAddrDelta())
X86Info->setTCReturnAddrDelta(FPDiff);
}
unsigned NumBytesToPush = NumBytes;
unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and will be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
NumBytesToPush = 0;
if (!ArgLocs.back().isMemLoc())
report_fatal_error("cannot use inalloca attribute on a register "
"parameter");
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
}
if (!IsSibcall)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
if (isTailCall && FPDiff)
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
// The next loop assumes that the locations are in the same order as the
// input arguments.
assert(isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
if (Flags.isInAlloca())
continue;
CCValAssign &VA = ArgLocs[I];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[OutIndex];
bool isByVal = Flags.isByVal();
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
if (Arg.getValueType().isVector() &&
Arg.getValueType().getVectorElementType() == MVT::i1)
Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
// Store the argument.
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
Chain = DAG.getStore(
Chain, dl, Arg, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
Arg = SpillSlot;
break;
}
}
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
if (isVarArg && IsWin64) {
// The Win64 ABI requires an argument XMM reg to be copied to the corresponding
// shadow reg if the callee is a varargs function.
unsigned ShadowReg = 0;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
case X86::XMM2: ShadowReg = X86::R8; break;
case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
}
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT to be in the EBX register before function calls
// made via the PLT GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code, load the
// address of the callee into ECX. The value in ECX is used as the target of
// the tail jump. This is done to circumvent the ebx/callee-saved problem
// for tail calls on PIC/GOT architectures. Normally we would just put the
// address of the GOT into ebx and then call target@PLT, but for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to
// target@PLT.
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasLocalLinkage() &&
G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
}
}
if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration), %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used, in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const MCPhysReg XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
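// Illustrative note (added, not in the original source): for a variadic call
// such as printf("%f %f", x, y) on SysV x86-64, the two doubles occupy XMM0
// and XMM1, so NumXMMRegs is 2 and %al is set to 2. Any upper bound up to 8
// would satisfy the ABI, but this lowering passes the exact count.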
if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
}
}
// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
// don't need this because the eligibility check rejects calls that require
// shuffling arguments passed in memory.
if (!IsSibcall && isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
// before any new outgoing arguments are stored to the stack, because the
// outgoing stack slots may alias the incoming argument stack slots, and
// the alias isn't otherwise explicit. This is slightly more conservative
// than necessary, because it means that each store effectively depends
// on every argument instead of just those arguments it would clobber.
SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = ArgLocs[I];
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
"Expecting custom case only in regcall calling convention");
// This means that we are in the special case where one argument was
// passed through two register locations - skip the next location.
++I;
}
continue;
}
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
// Skip inalloca arguments. They don't require any work.
if (Flags.isInAlloca())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
if (Flags.isByVal()) {
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, Source);
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
ArgChain,
Flags, DAG, dl));
} else {
// Store relative to framepointer.
MemOpChains2.push_back(DAG.getStore(
ArgChain, dl, Arg, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
getPointerTy(DAG.getDataLayout()),
RegInfo->getSlotSize(), FPDiff, dl);
}
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
} else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
const GlobalValue *GV = G->getGlobal();
if (!GV->hasDLLImportStorageClass()) {
unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
Callee = DAG.getTargetGlobalAddress(
GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
if (OpFlags == X86II::MO_GOTPCREL) {
// Add a wrapper.
Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
getPointerTy(DAG.getDataLayout()), Callee);
// Add extra indirection
Callee = DAG.getLoad(
getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
unsigned char OpFlags =
Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
Callee = DAG.getTargetExternalSymbol(
S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
} else if (Subtarget.isTarget64BitILP32() &&
Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit one, according to the x32 ABI.
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (isTailCall)
Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), then we
// use the X86_INTR calling convention because it has the same CSR mask
// (same preserved registers).
const uint32_t *Mask = RegInfo->getCallPreservedMask(
MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
// personality, assume the function clobbers all registers. If an exception
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
const Function *CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn->hasPersonalityFn()
? classifyEHPersonality(CallerFn->getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
// Define a new register mask from the existing mask.
uint32_t *RegMask = nullptr;
// In some calling conventions we need to remove the used physical registers
// from the reg mask.
if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Allocate a new Reg Mask and copy Mask.
RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
for (auto const &RegPair : RegsToPass)
for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
// Create the RegMask Operand according to our updated mask.
Ops.push_back(DAG.getRegisterMask(RegMask));
} else {
// Create the RegMask Operand according to the static mask.
Ops.push_back(DAG.getRegisterMask(Mask));
}
if (InFlag.getNode())
Ops.push_back(InFlag);
if (isTailCall) {
// We used to do:
//// If this is the first return lowered for this function, add the regs
//// to the liveout set for the function.
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget.getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
// For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPop = 4;
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// keep the MI verifier happy, we'll pretend the callee does it for us.
NumBytesForCalleeToPop = NumBytes;
}
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
InVals, RegMask);
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization is
// performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
// for example.)
// If a tail-called callee has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
// original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// Make the stack size aligned, e.g. 16n + 12 bytes for a 16-byte alignment
/// requirement.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
unsigned SlotSize = RegInfo->getSlotSize();
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// The remainder is no larger than (StackAlignment - SlotSize), so just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
} else {
// Mask out the lower bits, then add the stack alignment once plus (StackAlignment - SlotSize) bytes.
Offset = ((~AlignMask) & Offset) + StackAlignment +
(StackAlignment-SlotSize);
}
return Offset;
}
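// Worked example (added, not in the original source; assumes a 16-byte stack
// alignment and a 4-byte slot, i.e. 32-bit mode): StackSize = 20 has
// (20 & 15) = 4 <= 12, so the result is 20 + (12 - 4) = 28 = 16*1 + 12;
// StackSize = 30 has (30 & 15) = 14 > 12, so the result is
// (30 & ~15) + 16 + 12 = 44 = 16*2 + 12. Either way the returned size is
// congruent to (StackAlignment - SlotSize) modulo StackAlignment.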
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const X86InstrInfo *TII, const CCValAssign &VA) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
for (;;) {
// Look through nodes that don't alter the bits of the incoming value.
unsigned Op = Arg.getOpcode();
if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
Arg = Arg.getOperand(0);
continue;
}
if (Op == ISD::TRUNCATE) {
const SDValue &TruncInput = Arg.getOperand(0);
if (TruncInput.getOpcode() == ISD::AssertZext &&
cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
Arg.getValueType()) {
Arg = TruncInput.getOperand(0);
continue;
}
}
break;
}
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!TargetRegisterInfo::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
return false;
if (!Flags.isByVal()) {
if (!TII->isLoadFromStackSlot(*Def, FI))
return false;
} else {
unsigned Opcode = Def->getOpcode();
if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
} else
return false;
}
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
if (Flags.isByVal())
// ByVal argument is passed in as a pointer but it's now being
// dereferenced. e.g.
// define @foo(%struct.X* %A) {
// tail call @bar(%struct.X* byval %A)
// }
return false;
SDValue Ptr = Ld->getBasePtr();
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
if (!FINode)
return false;
FI = FINode->getIndex();
} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
FI = FINode->getIndex();
Bytes = Flags.getByValSize();
} else
return false;
assert(FI != INT_MAX);
if (!MFI.isFixedObjectIndex(FI))
return false;
if (Offset != MFI.getObjectOffset(FI))
return false;
// If this is not byval, check that the argument stack object is immutable.
// inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
Flags.isSExt() != MFI.isObjectSExt(FI)) {
return false;
}
}
return Bytes == MFI.getObjectSize(FI);
}
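// Illustrative note (added, not in the original source): for a sibcall such
// as "int f(int a, int b) { return g(a, b); }" with both arguments on the
// stack, each outgoing argument is a load from the caller's own fixed
// incoming-argument slot at the same offset, so MatchingStackOffset returns
// true and the call can be lowered as a sibcall without moving stack
// arguments.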
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
if (IsCalleeWin64 != IsCallerWin64)
return false;
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
// Look for obvious safe cases to perform tail call optimization that do not
// require ABI changes. This is what gcc calls sibcall.
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
return false;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
if (!ArgLocs[i].isRegLoc())
return false;
}
// If the call result is in ST0 / ST1, it needs to be popped off the x87
// stack. Therefore, if it's not used by the call it is not safe to optimize
// this into a sibcall.
bool Unused = false;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
if (!Ins[i].Used) {
Unused = true;
break;
}
}
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
return false;
}
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
RetCC_X86, RetCC_X86))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
// Allocate shadow area for Win64
if (IsCalleeWin64)
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
ISD::ArgFlagsTy Flags = Outs[i].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII, VA))
return false;
}
}
}
bool PositionIndependent = isPositionIndependent();
// If the tailcall address may be in a register, then make sure it's
// possible to register allocate for it. In 32-bit, the call address can
// only target EAX, EDX, or ECX since the tail call must be scheduled after
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
!isa<ExternalSymbolSDNode>(Callee)) ||
PositionIndependent)) {
unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
unsigned MaxInRegs = PositionIndependent ? 2 : 3;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
unsigned Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
if (++NumInRegs == MaxInRegs)
return false;
break;
}
}
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
}
bool CalleeWillPop =
X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt);
if (unsigned BytesToPop =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
// If we have bytes to pop, the callee must pop them.
bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
if (!CalleePopMatches)
return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
// If we don't have bytes to pop, make sure the callee doesn't pop any.
return false;
}
return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return X86::createFastISel(funcInfo, libInfo);
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
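/// Return true if Op is a normal load with a single use, making it a
/// candidate for folding into another instruction's memory operand.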
static bool MayFoldLoad(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
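/// Return true if Op has a single use and that use is a normal store.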
static bool MayFoldIntoStore(SDValue Op) {
return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
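/// Return true if Op has a single use and that use is a ZERO_EXTEND node.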
static bool MayFoldIntoZeroExtend(SDValue Op) {
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
return (ISD::ZERO_EXTEND == Opcode);
}
return false;
}
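/// Return true if Opcode is one of the X86-specific target shuffle opcodes.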
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
case X86ISD::MOVLPS:
case X86ISD::MOVLPD:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIV3:
case X86ISD::VZEXT_MOVL:
return true;
}
}
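/// Return true if Opcode is a target shuffle (or a 'faux' shuffle such as a
/// logical AND) whose mask is taken from a variable operand rather than an
/// immediate.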
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
default: return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIV3:
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
case X86ISD::ANDNP:
return true;
}
}
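/// Lazily create a fixed frame object for the return address slot, cache its
/// index in the function info, and return it as a frame index node.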
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-(int64_t)SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
if (!isInt<32>(Offset))
return false;
// If we don't have a symbolic displacement - we don't have any extra
// restrictions.
if (!hasSymbolicDisplacement)
return true;
// FIXME: Some tweaks might be needed for medium code model.
if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
// For the small code model we assume that the last object ends at least 16MB
// before the 31-bit boundary. We may also accept fairly large negative
// constants, knowing that all objects are in the positive half of the
// address space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space, so we do not accept negative offsets
// (they may push the address out of range), but we can accept fairly large
// positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
// If GuaranteeTCO is true, we force some calls to be callee pop so that we
// can guarantee TCO.
if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
return true;
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::X86_VectorCall:
return !is64Bit;
}
}
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
case X86::COND_E:
case X86::COND_NE:
case X86::COND_B:
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
return true;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
return false;
}
}
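/// Translate an integer ISD::CondCode into the equivalent X86 condition code.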
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
case ISD::SETGE: return X86::COND_GE;
case ISD::SETLT: return X86::COND_L;
case ISD::SETLE: return X86::COND_LE;
case ISD::SETNE: return X86::COND_NE;
case ISD::SETULT: return X86::COND_B;
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
}
}
/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
bool isFP, SDValue &LHS, SDValue &RHS,
SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
return TranslateIntegerX86CC(SetCCOpcode);
}
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
if (ISD::isNON_EXTLoad(LHS.getNode()) &&
!ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
switch (SetCCOpcode) {
default: break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
case ISD::SETUGE:
std::swap(LHS, RHS);
break;
}
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
case ISD::SETOLT: // flipped
case ISD::SETOGT:
case ISD::SETGT: return X86::COND_A;
case ISD::SETOLE: // flipped
case ISD::SETOGE:
case ISD::SETGE: return X86::COND_AE;
case ISD::SETUGT: // flipped
case ISD::SETULT:
case ISD::SETLT: return X86::COND_B;
case ISD::SETUGE: // flipped
case ISD::SETULE:
case ISD::SETLE: return X86::COND_BE;
case ISD::SETONE:
case ISD::SETNE: return X86::COND_NE;
case ISD::SETUO: return X86::COND_P;
case ISD::SETO: return X86::COND_NP;
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
}
}
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
default:
return false;
case X86::COND_B:
case X86::COND_BE:
case X86::COND_E:
case X86::COND_P:
case X86::COND_A:
case X86::COND_AE:
case X86::COND_NE:
case X86::COND_NP:
return true;
}
}
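/// Describe the memory access performed by memory-touching target intrinsics
/// (the expand-load, compress-store and truncating-store forms) so the
/// SelectionDAG builder can attach the correct memory operand.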
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData)
return false;
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.readMem = false;
Info.writeMem = false;
Info.vol = false;
Info.offset = 0;
switch (IntrData->Type) {
case EXPAND_FROM_MEM: {
Info.ptrVal = I.getArgOperand(0);
Info.memVT = MVT::getVT(I.getType());
Info.align = 1;
Info.readMem = true;
break;
}
case COMPRESS_TO_MEM: {
Info.ptrVal = I.getArgOperand(0);
Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
Info.align = 1;
Info.writeMem = true;
break;
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
Info.ptrVal = I.getArgOperand(0);
MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
ScalarVT = MVT::i16;
else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = 1;
Info.writeMem = true;
break;
}
default:
return false;
}
return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
return true;
}
return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation targets a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
return true;
}
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return false;
return true;
}
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
bool X86TargetLowering::isCheapToSpeculateCttz() const {
// Speculate cttz only if we can directly use TZCNT.
return Subtarget.hasBMI();
}
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT();
}
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
return true;
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
EVT VT = Y.getValueType();
if (VT != MVT::i32 && VT != MVT::i64)
return false;
return true;
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
MVT VT = MVT::getIntegerVT(NumBits);
if (isTypeLegal(VT))
return VT;
// PMOVMSKB can handle this.
if (NumBits == 128 && isTypeLegal(MVT::v16i8))
return MVT::v16i8;
// VPMOVMSKB can handle this.
if (NumBits == 256 && isTypeLegal(MVT::v32i8))
return MVT::v32i8;
// TODO: Allow 64-bit type for 32-bit target.
// TODO: 512-bit types should be allowed, but make sure that those
// cases are handled in combineVectorSizedSetCCEquality().
return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (Mask[i] != SM_SentinelUndef)
return false;
return true;
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
int Low, int Hi) {
for (int M : Mask)
if (!isUndefOrInRange(M, Low, Hi))
return false;
return true;
}
/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
for (int M : Mask)
if (!isUndefOrZeroOrInRange(M, Low, Hi))
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
unsigned Pos, unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
}
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
if (!isUndefOrZero(Mask[i]))
return false;
return true;
}
/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
int M0 = Mask[i];
int M1 = Mask[i + 1];
// If both elements are undef, it's trivial.
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
WidenedMask[i / 2] = M1 / 2;
continue;
}
if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
(M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
return false;
}
// Finally check if the two mask values are adjacent and aligned with
// a pair.
if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
WidenedMask[i / 2] = M0 / 2;
continue;
}
// Otherwise we can't safely widen the elements used in this shuffle.
return false;
}
assert(WidenedMask.size() == Mask.size() / 2 &&
"Incorrect size of mask after widening the elements!");
return true;
}
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can always
/// succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
SmallVectorImpl<int> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
int NumElts = Mask.size();
ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
if (M < 0) {
for (int s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = M;
continue;
}
// Scale mask element and increment across each mask element.
for (int s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = (Scale * M) + s;
}
}
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
return false;
// The index should be aligned on a vecWidth-bit boundary.
uint64_t Index = N->getConstantOperandVal(1);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for the
/// 128- or 256-bit subvector insertion instructions.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
return false;
// The index should be aligned on a vecWidth-bit boundary.
uint64_t Index = N->getConstantOperandVal(2);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
return isVINSERTIndex(N, 128);
}
bool X86::isVINSERT256Index(SDNode *N) {
return isVINSERTIndex(N, 256);
}
bool X86::isVEXTRACT128Index(SDNode *N) {
return isVEXTRACTIndex(N, 128);
}
bool X86::isVEXTRACT256Index(SDNode *N) {
return isVEXTRACTIndex(N, 256);
}
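/// Compute the VEXTRACTxxx immediate, i.e. the index of the vecWidth-bit
/// chunk being extracted, for an EXTRACT_SUBVECTOR node.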
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
"Illegal extract subvector for VEXTRACT");
uint64_t Index = N->getConstantOperandVal(1);
MVT VecVT = N->getOperand(0).getSimpleValueType();
unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
"Illegal insert subvector for VINSERT");
uint64_t Index = N->getConstantOperandVal(2);
MVT VecVT = N->getSimpleValueType(0);
unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
return getExtractVEXTRACTImmediate(N, 128);
}
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
return getExtractVEXTRACTImmediate(N, 256);
}
/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 128);
}
/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(Values[i], dl, EltVT);
Ops.push_back(OpNode);
if (Split)
Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
DAG.getConstant(0, dl, EltVT));
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
ConstsNode = DAG.getBitcast(VT, ConstsNode);
return ConstsNode;
}
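// Build a constant vector from raw APInt element bits and a per-element
// undef mask, splitting 64-bit elements into 32-bit pairs when i64 is not
// a legal type.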
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
unsigned NumElts = VT.getVectorNumElements();
bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
Split = true;
}
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
if (Undefs[i]) {
Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
continue;
}
const APInt &V = Bits[i];
assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
if (Split) {
Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
} else if (EltVT == MVT::f32) {
APFloat FV(APFloat::IEEEsingle(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else if (EltVT == MVT::f64) {
APFloat FV(APFloat::IEEEdouble(), V);
Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
} else {
Ops.push_back(DAG.getConstant(V, dl, EltVT));
}
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
VT.getVectorElementType() == MVT::i1) &&
"Unexpected vector type");
// Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
// type. This ensures they get CSE'd. But if the integer type is not
// available, use a floating-point +0.0 instead.
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
"Unexpected vector type");
Vec = DAG.getConstant(0, dl, VT);
} else {
unsigned Num32BitElts = VT.getSizeInBits() / 32;
Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
}
return DAG.getBitcast(VT, Vec);
}
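/// Extract a vectorWidth-bit chunk from Vec, starting at the chunk that
/// contains element IdxVal (the index is rounded down to a chunk boundary).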
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
unsigned Factor = VT.getSizeInBits()/vectorWidth;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert((Vec.getValueType().is256BitVector() ||
Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}
/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
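/// Insert Vec into the vectorWidth-bit chunk of Result that contains element
/// IdxVal (the index is rounded down to a chunk boundary).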
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl,
unsigned vectorWidth) {
assert((vectorWidth == 128 || vectorWidth == 256) &&
"Unsupported vector width");
// Inserting UNDEF leaves Result unchanged.
if (Vec.isUndef())
return Result;
EVT VT = Vec.getValueType();
EVT ElVT = VT.getVectorElementType();
EVT ResultVT = Result.getValueType();
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
// we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
IdxVal &= ~(ElemsPerChunk - 1);
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
// Return true if the instruction zeroes the unused upper part of the
// destination and accepts a mask.
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
case X86ISD::CMPMU:
return true;
}
}
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
// There are 3 possible cases:
// 1. Subvector should be inserted in the lower part (IdxVal == 0)
// 2. Subvector should be inserted in the upper part
// (IdxVal + SubVecNumElems == NumElems)
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
// If this node widens - by concatenating zeroes - the type of the result of
// a node whose instruction already zeroes all upper (irrelevant) bits of the
// output register, mark this node as legal so that it can be replaced with
// the v8i1 version of the previous instruction during instruction selection.
// For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
// k-register while zeroing the remaining upper 60 bits of that register. If
// the result of such an instruction is inserted into an all-zeroes vector,
// then we can safely remove the insert_vector (in instruction selection)
// because the compare instruction has already zeroed the rest of the
// register.
if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
(isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
(SubVec.getOpcode() == ISD::AND &&
(isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
return Op;
// Extend to a natively supported kshift.
MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
MVT WideOpVT = OpVT;
if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
WideOpVT = MinVT;
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Undef = DAG.getUNDEF(WideOpVT);
SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
// Extract the sub-vector if required.
auto ExtractSubVec = [&](SDValue V) {
return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
OpVT, V, ZeroIdx);
};
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
ShiftBits);
}
return ExtractSubVec(WideSubVec);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
return ExtractSubVec(Vec);
}
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, ZeroIdx);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
// Subvector should be inserted in the middle - use shuffle
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
SubVec, ZeroIdx);
SmallVector<int, 64> Mask;
for (unsigned i = 0; i < NumElems; ++i)
Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
i : i + NumElems);
return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
const SDLoc &dl) {
SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}
static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
unsigned NumElems, SelectionDAG &DAG,
const SDLoc &dl) {
SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
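/// Helper to create an X86ISD::VSEXT/VZEXT node (or the equivalent
/// sign/zero-extend-vector-in-reg node for 128-bit cases), feeding it only
/// the low portion of the input that the wider result actually needs.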
static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
if (VT.is128BitVector() && InVT.is128BitVector())
return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
: DAG.getZeroExtendVectorInReg(In, DL, VT);
// For 256-bit vectors, we only need the lower (128-bit) input half.
// For 512-bit vectors, we only need the lower input half or quarter.
if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
std::max(128, (int)VT.getSizeInBits() / Scale));
}
return DAG.getNode(Opc, DL, VT, In);
}
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
Pos += (Lo ? 0 : NumEltsInLane / 2);
Mask.push_back(Pos);
}
}
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
// If this is the insertion idx, put the low elt of V2 here.
MaskVec[i] = (i == Idx) ? NumElems : i;
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
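// Peek through any number of bitcasts to find the underlying value.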
static SDValue peekThroughBitcasts(SDValue V) {
while (V.getNode() && V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
return V;
}
static SDValue peekThroughOneUseBitcasts(SDValue V) {
while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
V.getOperand(0).hasOneUse())
V = V.getOperand(0);
return V;
}
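// If Op (looking through bitcasts) is a load from the constant pool, return
// the Constant being loaded; otherwise return null.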
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
auto *Load = dyn_cast<LoadSDNode>(Op);
if (!Load)
return nullptr;
SDValue Ptr = Load->getBasePtr();
if (Ptr->getOpcode() == X86ISD::Wrapper ||
Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry())
return nullptr;
return dyn_cast<Constant>(CNode->getConstVal());
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
APInt &UndefElts,
SmallVectorImpl<APInt> &EltBits,
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
EVT VT = Op.getValueType();
unsigned SizeInBits = VT.getSizeInBits();
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
// Bitcast a source array of element bits to the target size.
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
unsigned NumSrcElts = UndefSrcElts.getBitWidth();
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
"Constant bit sizes don't match");
// Don't split if we don't allow undef bits.
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (UndefSrcElts.getBoolValue() && !AllowUndefs)
return false;
// If we're already the right size, don't bother bitcasting.
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
return true;
}
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
}
// Split the undef/constant single bitset data into the target elements.
UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
// Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
if (!AllowWholeUndefs)
return false;
UndefElts.setBit(i);
continue;
}
// If only some bits are UNDEF then treat them as zero (or bail if not
// supported).
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
return false;
APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
EltBits[i] = Bits.getZExtValue();
}
return true;
};
// Collect constant bits and insert into mask/undef bit masks.
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
unsigned UndefBitIndex) {
if (!Cst)
return false;
if (isa<UndefValue>(Cst)) {
Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
const SDValue &Src = Op.getOperand(i);
if (Src.isUndef()) {
UndefSrcElts.setBit(i);
continue;
}
auto *Cst = cast<ConstantSDNode>(Src);
SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
}
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
return false;
unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumSrcElts = CstTy->getVectorNumElements();
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
for (unsigned i = 0; i != NumSrcElts; ++i)
if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
UndefSrcElts, i))
return false;
return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
UndefSrcElts.setBits(0, NumSrcElts);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
// Extract a rematerialized scalar constant insertion.
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
SmallVector<APInt, 64> SrcEltBits;
auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
}
return false;
}
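// Decode a constant shuffle mask operand into raw mask indices, one value per
// MaskEltSizeInBits-bit element. Fails if any element is undef.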
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
// Extract the raw target constant bits.
// FIXME: We currently don't support UNDEF bits or mask entries.
if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
EltBits, /* AllowWholeUndefs */ false,
/* AllowPartialUndefs */ false))
return false;
// Insert the extracted elements into the mask.
for (APInt Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
IsUnary = false;
bool IsFakeUnary = false;
switch(N->getOpcode()) {
case X86ISD::BLENDI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
int BitIdx = N->getConstantOperandVal(2);
DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
IsUnary = true;
}
break;
case X86ISD::INSERTQI:
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
int BitIdx = N->getConstantOperandVal(3);
DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
}
break;
case X86ISD::UNPCKH:
DecodeUNPCKHMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
DecodeUNPCKLMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
DecodeZeroMoveLowMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
SDValue N0 = N->getOperand(0);
// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
// add the pre-extracted value to the Ops vector.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getOperand(0).getValueType() == VT &&
N0.getConstantOperandVal(1) == 0)
Ops.push_back(N0.getOperand(0));
// We only decode broadcasts of same-sized vectors, unless the broadcast
// came from an extract from the original width. If we found one, we
// pushed it onto the Ops vector above.
if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
}
return false;
}
case X86ISD::VPERMILPV: {
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMILPMask(VT, RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMILPMask(C, MaskEltSize, Mask);
break;
}
return false;
}
case X86ISD::PSHUFB: {
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodePSHUFBMask(RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodePSHUFBMask(C, Mask);
break;
}
return false;
}
case X86ISD::VPERMI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
DecodeMOVDDUPMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
// Not yet implemented
return false;
case X86ISD::VPERMIL2: {
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();
SDValue MaskNode = N->getOperand(2);
SDValue CtrlNode = N->getOperand(3);
if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
unsigned CtrlImm = CtrlOp->getZExtValue();
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
break;
}
}
return false;
}
case X86ISD::VPPERM: {
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SmallVector<uint64_t, 32> RawMask;
if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
DecodeVPPERMMask(RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPPERMMask(C, Mask);
break;
}
return false;
}
case X86ISD::VPERMV: {
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
SDValue MaskNode = N->getOperand(0);
SmallVector<uint64_t, 32> RawMask;
unsigned MaskEltSize = VT.getScalarSizeInBits();
if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
DecodeVPERMVMask(RawMask, Mask);
break;
}
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMVMask(C, MaskEltSize, Mask);
break;
}
return false;
}
case X86ISD::VPERMV3: {
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMV3Mask(C, MaskEltSize, Mask);
break;
}
return false;
}
case X86ISD::VPERMIV3: {
IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(2));
SDValue MaskNode = N->getOperand(0);
unsigned MaskEltSize = VT.getScalarSizeInBits();
if (auto *C = getTargetConstantFromNode(MaskNode)) {
DecodeVPERMV3Mask(C, MaskEltSize, Mask);
break;
}
return false;
}
default: llvm_unreachable("unknown target shuffle node");
}
// Empty mask indicates the decode failed.
if (Mask.empty())
return false;
// Check if we're getting a shuffle mask with zero'd elements.
if (!AllowSentinelZero)
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
if (IsFakeUnary)
for (int &M : Mask)
if (M >= (int)Mask.size())
M -= Mask.size();
// If we didn't already add operands in the opcode-specific code, default to
// adding 1 or 2 operands starting at 0.
if (Ops.empty()) {
Ops.push_back(N->getOperand(0));
if (!IsUnary || IsFakeUnary)
Ops.push_back(N->getOperand(1));
}
return true;
}
/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
MVT VT = N.getSimpleValueType();
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Mask.size()) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
// Extract known constant input data.
APInt UndefSrcElts[2];
SmallVector<APInt, 32> SrcEltBits[2];
bool IsSrcConstant[2] = {
getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
SrcEltBits[0], true, false),
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
if (M < 0)
continue;
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
// We are referencing an UNDEF input.
if (V.isUndef()) {
Mask[i] = SM_SentinelUndef;
continue;
}
// SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
// TODO: We currently only set UNDEF for integer types - floats use the same
// registers as vectors and many of the scalar folded loads rely on the
// SCALAR_TO_VECTOR pattern.
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Size % V.getValueType().getVectorNumElements()) == 0) {
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
}
}
assert(VT.getVectorNumElements() == Mask.size() &&
"Different mask size from vector size!");
return true;
}
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned NumSizeInBits = VT.getSizeInBits();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
"Expected byte aligned value types");
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case ISD::AND:
case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
bool IsAndN = (X86ISD::ANDNP == Opcode);
uint64_t ZeroMask = IsAndN ? 255 : 0;
if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
Mask.push_back(SM_SentinelUndef);
continue;
}
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
}
Ops.push_back(IsAndN ? N1 : N0);
return true;
}
case ISD::SCALAR_TO_VECTOR: {
// Match against a scalar_to_vector of an extract from a vector,
// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
N0.getOperand(0).getValueType() == VT) {
SrcExtract = N0;
} else if (N0.getOpcode() == ISD::AssertZext &&
N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
SrcExtract = N0.getOperand(0);
assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
} else if (N0.getOpcode() == ISD::AssertZext &&
N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
SrcExtract = N0.getOperand(0);
assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
if (NumSrcElts <= SrcIdx)
return false;
Ops.push_back(SrcVec);
Mask.push_back(SrcIdx);
Mask.append(NumZeros, SM_SentinelZero);
Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
return true;
}
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
SDValue InVec = N.getOperand(0);
SDValue InScl = N.getOperand(1);
uint64_t InIdx = N.getConstantOperandVal(2);
assert(InIdx < NumElts && "Illegal insertion index");
// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
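// For example, PINSRW(V, 0, 3) on v8i16 decodes as the mask
// <0,1,2,Z,4,5,6,7> over V, where Z is SM_SentinelZero.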
if (X86::isZeroNode(InScl)) {
Ops.push_back(InVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
return true;
}
// Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
if (InScl.getOpcode() != ISD::AssertZext ||
InScl.getOperand(0).getOpcode() != ExOp)
return false;
SDValue ExVec = InScl.getOperand(0).getOperand(0);
uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
assert(ExIdx < NumElts && "Illegal extraction index");
Ops.push_back(InVec);
Ops.push_back(ExVec);
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
case X86ISD::PACKSS: {
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
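// For example, PACKSSWB of two v8i16 inputs that each carry more than 8 sign
// bits per element is just a truncation: the decoded byte mask <0,2,4,...,30>
// selects the low byte of every i16 element of the concatenated inputs.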
if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
return false;
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(i * 2);
return true;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (NumBitsPerElt <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
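// For example, a v2i64 VSRLI by 32 bits (ByteShift == 4) produces the byte
// mask <4,5,6,7,Z,Z,Z,Z,12,13,14,15,Z,Z,Z,Z>: the upper four bytes of each
// element shift down and the vacated bytes become SM_SentinelZero.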
Mask.append(NumBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VZEXT: {
// TODO - add support for VPMOVZX with smaller input vector types.
SDValue Src = N.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (NumSizeInBits != SrcVT.getSizeInBits())
break;
DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
Ops.push_back(Src);
return true;
}
}
return false;
}
/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
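/// For example, if the mask only references indices belonging to the second
/// of two inputs, the first input is dropped and every mask value is rebased
/// by the mask width, leaving a unary shuffle of the remaining input.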
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask) {
int MaskWidth = Mask.size();
SmallVector<SDValue, 16> UsedInputs;
for (int i = 0, e = Inputs.size(); i < e; ++i) {
int lo = UsedInputs.size() * MaskWidth;
int hi = lo + MaskWidth;
if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
UsedInputs.push_back(Inputs[i]);
continue;
}
for (int &M : Mask)
if (lo <= M)
M -= MaskWidth;
}
Inputs = UsedInputs;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
SelectionDAG &DAG) {
if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
return false;
resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
unsigned Depth) {
if (Depth == 6)
return SDValue(); // Limit search depth.
SDValue V = SDValue(N, 0);
EVT VT = V.getValueType();
unsigned Opcode = V.getOpcode();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
unsigned NumElems = VT.getVectorNumElements();
SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
: SV->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
MVT ShufVT = V.getSimpleValueType();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
: DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
// Actual nodes that may contain scalar elements
if (Opcode == ISD::BITCAST) {
V = V.getOperand(0);
EVT SrcVT = V.getValueType();
unsigned NumElems = VT.getVectorNumElements();
if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
return SDValue();
}
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
return (Index == 0) ? V.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
if (V.getOpcode() == ISD::BUILD_VECTOR)
return V.getOperand(Index);
return SDValue();
}
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
SDValue V;
bool First = true;
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
bool IsNonZero = (NonZeros & (1 << i)) != 0;
if (IsNonZero) {
// If the build vector contains zeros or our first insertion is not the
// first index, then insert into a zero vector to break any register
// dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
First = false;
if (NumZero || 0 != i)
V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
else {
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(MVT::v16i8, V);
continue;
}
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
return V;
}
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
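// Each even/odd byte pair is zero-extended to i16, the odd byte is shifted
// into the high half and OR'd with the even byte, and the combined i16 is
// inserted at position i/2 of a v8i16 (position 0 goes through
// SCALAR_TO_VECTOR/VZEXT_MOVL instead) before the final bitcast to v16i8.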
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
if (NumZero)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else
V = DAG.getUNDEF(MVT::v8i16);
First = false;
}
if ((i & 1) != 0) {
// FIXME: Investigate extending to i32 instead of just i16.
// FIXME: Investigate combining the first 4 bytes as a i32 instead.
SDValue ThisElt, LastElt;
bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
if (LastIsNonZero) {
LastElt =
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
if (ThisElt) {
if (1 == i) {
V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
: DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(MVT::v8i16, V);
} else {
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
DAG.getIntPtrConstant(i / 2, dl));
}
}
}
}
return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
bool IsNonZero = (NonZeros & (1 << i)) != 0;
if (IsNonZero) {
// If the build vector contains zeros or our first insertion is not the
// first index, then insert into a zero vector to break any register
// dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
First = false;
if (NumZero || 0 != i)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
assert(0 == i && "Expected insertion into zero-index");
V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
V = DAG.getBitcast(MVT::v8i16, V);
continue;
}
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
return V;
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
}
assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
// zeroable or extract_vector_elt with constant index.
SDValue FirstNonZero;
unsigned FirstNonZeroIdx;
for (unsigned i=0; i < 4; ++i) {
if (Zeroable[i])
continue;
SDValue Elt = Op->getOperand(i);
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Elt.getOperand(1)))
return SDValue();
// Make sure that this node is extracting from a 128-bit vector.
MVT VT = Elt.getOperand(0).getSimpleValueType();
if (!VT.is128BitVector())
return SDValue();
if (!FirstNonZero.getNode()) {
FirstNonZero = Elt;
FirstNonZeroIdx = i;
}
}
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
SDValue V1 = FirstNonZero.getOperand(0);
MVT VT = V1.getSimpleValueType();
// See if this build_vector can be lowered as a blend with zero.
SDValue Elt;
unsigned EltMaskIdx, EltIdx;
int Mask[4];
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
if (Zeroable[EltIdx]) {
// The zero vector will be on the right hand side.
Mask[EltIdx] = EltIdx+4;
continue;
}
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
}
if (EltIdx == 4) {
// Let the shuffle legalizer deal with blend operations.
SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
if (V1.getSimpleValueType() != VT)
V1 = DAG.getBitcast(VT, V1);
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
}
// See if we can lower this build_vector to a INSERTPS.
if (!Subtarget.hasSSE41())
return SDValue();
SDValue V2 = Elt.getOperand(0);
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
V1 = SDValue();
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
if (Zeroable[i])
continue;
SDValue Current = Op->getOperand(i);
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
}
if (!CanFold)
return SDValue();
assert(V1.getNode() && "Expected at least two non-zero elements!");
if (V1.getSimpleValueType() != MVT::v4f32)
V1 = DAG.getBitcast(MVT::v4f32, V1);
if (V2.getSimpleValueType() != MVT::v4f32)
V2 = DAG.getBitcast(MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
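// The immediate follows the INSERTPS encoding: bits [7:6] select the source
// element of V2, bits [5:4] select the destination lane, and bits [3:0] are
// the zero mask. E.g. inserting V2[2] into lane 1 while zeroing lane 3 gives
// (2 << 6) | (1 << 4) | 0b1000 == 0x98.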
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getIntPtrConstant(InsertPSMask, DL));
return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
SelectionDAG &DAG, const TargetLowering &TLI,
const SDLoc &dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load. And if
// the address is "base + cst" see if the cst can be "absorbed" into
// the shuffle mask.
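// For example, a scalar f32 load from a 16-byte aligned stack object at
// offset +8 can be widened to a v4f32 load of the whole slot and then
// splatted from element 2 (EltNo == Offset >> 2).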
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
if (!ISD::isNormalLoad(LD) || LD->isVolatile())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
return SDValue();
int FI = -1;
int64_t Offset = 0;
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
FI = FINode->getIndex();
Offset = 0;
} else if (DAG.isBaseWithConstantOffset(Ptr) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
Offset = Ptr.getConstantOperandVal(1);
Ptr = Ptr.getOperand(0);
} else {
return SDValue();
}
// FIXME: 256-bit vector instructions don't require a strict alignment,
// improve this code to support it better.
unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
MFI.setObjectAlignment(FI, RequiredAlign);
}
}
// (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~(RequiredAlign - 1)).
if (Offset < 0)
return SDValue();
if ((Offset % RequiredAlign) & 3)
return SDValue();
int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
}
int EltNo = (Offset - StartOffset) >> 2;
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset));
SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
}
return SDValue();
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
SmallBitVector LoadMask(NumElems, false);
SmallBitVector ZeroMask(NumElems, false);
SmallBitVector UndefMask(NumElems, false);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
if (!Elt.getNode())
return SDValue();
if (Elt.isUndef())
UndefMask[i] = true;
else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
ZeroMask[i] = true;
else if (ISD::isNON_EXTLoad(Elt.getNode())) {
LoadMask[i] = true;
LastLoadedElt = i;
// Each loaded element must be the correct fractional portion of the
// requested vector load.
if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
return SDValue();
} else
return SDValue();
}
assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
"Incomplete element masks");
// Handle Special Cases - all undef or undef/zero.
if (UndefMask.count() == NumElems)
return DAG.getUNDEF(VT);
// FIXME: Should we return this as a BUILD_VECTOR instead?
if ((ZeroMask | UndefMask).count() == NumElems)
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
int FirstLoadedElt = LoadMask.find_first();
SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
EVT LDBaseVT = EltBase.getValueType();
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZERO elements require an
// additional shuffle stage to clear the ZERO elements.
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
SDValue Elt = peekThroughBitcasts(Elts[i]);
LoadSDNode *LD = cast<LoadSDNode>(Elt);
if (!DAG.areNonVolatileConsecutiveLoads(
LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
}
} else if (ZeroMask[i]) {
IsConsecutiveLoad = false;
}
}
auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
"Cannot merge volatile loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
return NewLd;
};
// LOAD - all consecutive load/undefs (must start/end with a load).
// If we have found an entire vector of loads and undefs, then return a large
// load of the entire vector width starting at the base pointer.
// If the vector contains zeros, then attempt to shuffle those elements.
if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
assert(LDBase && "Did not find base load for merging consecutive loads");
EVT EltVT = LDBase->getValueType(0);
// Ensure that the input vector size for the merged loads matches the
// cumulative size of the input elements.
if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
return SDValue();
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
// Don't create 256-bit non-temporal aligned loads without AVX2 as these
// will lower to regular temporal loads and use the cache.
if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
VT.is256BitVector() && !Subtarget.hasInt256())
return SDValue();
if (IsConsecutiveLoad)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
ClearMask[i] = i + NumElems;
else if (LoadMask[i])
ClearMask[i] = i;
}
SDValue V = CreateLoad(VT, LDBase);
SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
: DAG.getConstantFP(0.0, DL, VT);
return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
int LoadSize =
(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
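// Example: <load i32 *a, zero, zero, zero> for v4i32 becomes a VZEXT_LOAD of
// a single i32 - the loaded value lands in lane 0 and the rest is zero.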
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
(LoadSize == 32 || LoadSize == 64) &&
((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
: MVT::getIntegerVT(LoadSize);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
return SDValue();
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
if (ScalarSize == 32) {
Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
} else {
assert(ScalarSize == 64 && "Unsupported floating point scalar size");
Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
}
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
}
return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
static bool isUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
if (isTargetShuffle(U->getOpcode()))
return true;
if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
return isUseOfShuffle(U);
}
return false;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
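/// For example, with AVX2 a v8i32 constant with the repeated pattern
/// <5,17,5,17,...> is lowered by loading the 64-bit pattern once from the
/// constant pool, broadcasting it as v4i64 and bitcasting back to v8i32.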
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
if (!Subtarget.hasAVX())
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// We need a splat of a single value to use broadcast, and it doesn't
// make any sense if the value is only in one element of the vector.
if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
APInt SplatValue, Undef;
unsigned SplatBitSize;
bool HasUndef;
// Check if this is a repeated constant pattern suitable for broadcasting.
if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
SplatBitSize > VT.getScalarSizeInBits() &&
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with a broadcast when the build_vector is used by a
// shuffle instruction, to preserve the present custom lowering of shuffles.
if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
!(SplatBitSize == 64 && Subtarget.is32Bit())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize == 32 || SplatBitSize == 64) {
// Splatted value can fit in one FLOAT constant in constant pool.
// Load the constant and broadcast it.
// AVX has support for 32 and 64 bit broadcasts for floats only.
// No 64-bit integer broadcast on a 32-bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Lower the splat via APFloat directly, to avoid any conversion.
Constant *C =
SplatBitSize == 32
? ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEsingle(), SplatValue))
: ConstantFP::get(*Ctx,
APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
MVT::getVectorVT(CVT, Repeat), Ld);
return DAG.getBitcast(VT, Brdcst);
} else if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
return DAG.getBitcast(VT, Brdcst);
}
}
}
return SDValue();
}
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
// When optimizing for size, generate up to 5 extra bytes for a broadcast
// instruction to save 8 or more bytes of constant pool data.
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
// On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
// But override that restriction when optimizing for size.
// TODO: Check if splatting is recommended for other AVX-capable CPUs.
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
C = CF->getConstantFPValue();
assert(C && "Invalid constant type");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(
CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The scalar source must be a normal load.
if (!IsLoad)
return SDValue();
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
(Subtarget.hasVLX() && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit case, so that it
// doesn't match double, since there is no vbroadcastsd for xmm registers.
if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
// Unsupported broadcast.
return SDValue();
}
/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
SDValue ExtIdx) {
int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
return Idx;
// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// lowered this:
// (extract_vector_elt (v8f32 %vreg1), Constant<6>)
// to:
// (extract_vector_elt (vector_shuffle<2,u,u,u>
// (extract_subvector (v8f32 %vreg0), Constant<4>),
// undef)
// Constant<0>)
// In this case the vector is the extract_subvector expression and the index
// is 2, as specified by the shuffle.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
SDValue ShuffleVec = SVOp->getOperand(0);
MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
assert(ShuffleVecVT.getVectorElementType() ==
ExtractedFromVec.getSimpleValueType().getVectorElementType());
int ShuffleIdx = SVOp->getMaskElt(Idx);
if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
ExtractedFromVec = ShuffleVec;
return ShuffleIdx;
}
return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
SDLoc DL(Op);
unsigned NumElems = Op.getNumOperands();
SDValue VecIn1;
SDValue VecIn2;
SmallVector<unsigned, 4> InsertIndices;
SmallVector<int, 8> Mask(NumElems, -1);
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Opc = Op.getOperand(i).getOpcode();
if (Opc == ISD::UNDEF)
continue;
if (Opc != ISD::EXTRACT_VECTOR_ELT) {
// Quit if more than 1 elements need inserting.
if (InsertIndices.size() > 1)
return SDValue();
InsertIndices.push_back(i);
continue;
}
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
// Quit if extracted from vector of different type.
if (ExtractedFromVec.getValueType() != VT)
return SDValue();
if (!VecIn1.getNode())
VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
if (!VecIn2.getNode())
VecIn2 = ExtractedFromVec;
else if (VecIn2 != ExtractedFromVec)
// Quit if more than 2 vectors to shuffle
return SDValue();
}
if (ExtractedFromVec == VecIn1)
Mask[i] = Idx;
else if (ExtractedFromVec == VecIn2)
Mask[i] = Idx + NumElems;
}
if (!VecIn1.getNode())
return SDValue();
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
return NV;
}
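/// Pack a constant build_vector of i1 elements into an integer immediate.
/// For example, <i1 1, i1 0, i1 1, i1 1> becomes the i8 constant 0b1101
/// (bit idx is set when operand idx is one; undef operands contribute zero).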
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
Op.getScalarValueSizeInBits() == 1 &&
"Can not convert non-constant vector");
uint64_t Immediate = 0;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()))
return DAG.getTargetConstant(0, dl, VT);
if (ISD::isBuildVectorAllOnes(Op.getNode()))
return DAG.getTargetConstant(1, dl, VT);
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
if (Imm.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, Imm);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
// Vector has one or more non-const elements
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
bool HasConstElts = false;
int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
SplatIdx = idx;
else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
}
// For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
if (IsSplat)
return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
DAG.getConstant(1, dl, VT),
DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
SDValue Imm;
if (Immediate) {
MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
Imm = DAG.getConstant(Immediate, dl, ImmVT);
}
else if (HasConstElts)
Imm = DAG.getConstant(0, dl, VT);
else
Imm = DAG.getUNDEF(VT);
if (Imm.getValueSizeInBits() == VT.getSizeInBits())
DstVec = DAG.getBitcast(VT, Imm);
else {
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
Op.getOperand(InsertIdx),
DAG.getIntPtrConstant(InsertIdx, dl));
}
return DstVec;
}
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands of the horizontal binop in V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
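/// For example, for a v4f32 build_vector the element sequence
///   <(fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])>
/// matches ISD::FADD with V0 = A and V1 = B.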
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
SelectionDAG &DAG,
unsigned BaseIdx, unsigned LastIdx,
SDValue &V0, SDValue &V1) {
EVT VT = N->getValueType(0);
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
"Invalid Vector in input!");
bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
V0 = DAG.getUNDEF(VT);
V1 = DAG.getUNDEF(VT);
// Check if N implements a horizontal binop.
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
SDValue Op = N->getOperand(i + BaseIdx);
// Skip UNDEFs.
if (Op->isUndef()) {
// Update the expected vector extract index.
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
ExpectedVExtractIdx += 2;
continue;
}
CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
if (!CanFold)
break;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0) == Op1.getOperand(0) &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
isa<ConstantSDNode>(Op1.getOperand(1)));
if (!CanFold)
break;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
if (V0.isUndef()) {
V0 = Op0.getOperand(0);
if (V0.getValueType() != VT)
return false;
}
} else {
if (V1.isUndef()) {
V1 = Op0.getOperand(0);
if (V1.getValueType() != VT)
return false;
}
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
if (I0 == ExpectedVExtractIdx)
CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
else if (IsCommutable && I1 == ExpectedVExtractIdx) {
// Try to match the following dag sequence:
// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
} else
CanFold = false;
ExpectedVExtractIdx += 2;
}
return CanFold;
}
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node would take as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node would take as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
/// Example:
/// HADD V0_LO, V0_HI
/// HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
/// Example:
/// HADD V0_LO, V1_LO
/// HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
const SDLoc &DL, SelectionDAG &DAG,
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
MVT VT = V0.getSimpleValueType();
assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
"Invalid nodes in input!");
unsigned NumElts = VT.getVectorNumElements();
SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
MVT NewVT = V0_LO.getSimpleValueType();
SDValue LO = DAG.getUNDEF(NewVT);
SDValue HI = DAG.getUNDEF(NewVT);
if (Mode) {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && !V0->isUndef())
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
if (!isUndefHI && !V1->isUndef())
HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
} else {
// Don't emit a horizontal binop if the result is expected to be UNDEF.
if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of an ADDSUB operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
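/// For example, the v4f32 build_vector
///   <(fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])>
/// is recognized with Opnd0 = A and Opnd1 = B: subtract in the even lanes and
/// add in the odd lanes, matching the ADDSUB semantics.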
static bool isAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1) {
MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
(!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
(!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
return false;
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
// Odd-numbered elements in the input build vector are obtained from
// adding two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
// subtracting two integer/float elements.
unsigned ExpectedOpcode = ISD::FSUB;
unsigned NextExpectedOpcode = ISD::FADD;
bool AddFound = false;
bool SubFound = false;
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
unsigned Opcode = Op.getOpcode();
if (Opcode == ISD::UNDEF) {
std::swap(ExpectedOpcode, NextExpectedOpcode);
continue;
}
// Early exit if we found an unexpected opcode.
if (Opcode != ExpectedOpcode)
return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Try to match the following pattern:
// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
// Early exit if we cannot match that sequence.
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
return false;
// We found a valid add/sub node. Update the information accordingly.
if (i & 1)
AddFound = true;
else
SubFound = true;
// Update InVec0 and InVec1.
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
return false;
}
// Make sure that the operands of each add/sub node always
// come from the same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (ExpectedOpcode == ISD::FSUB)
return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
return false;
}
if (InVec1 != Op1.getOperand(0))
return false;
// Update the pair of expected opcodes.
std::swap(ExpectedOpcode, NextExpectedOpcode);
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
return false;
Opnd0 = InVec0;
Opnd1 = InVec1;
return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already been
/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
/// If (and only if) true is returned, the operands of FMADDSUB are written to
/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
/// %AB = fmul fast <2 x double> %A, %B
/// %Sub = fsub fast <2 x double> %AB, %C
/// %Add = fadd fast <2 x double> %AB, %C
/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
/// <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
/// %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
/// %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
!Subtarget.hasAnyFMA())
return false;
// FIXME: These checks must match the similar ones in
// DAGCombiner::visitFADDForFMACombine. It would be good to have one
// function that would answer if it is Ok to fuse MUL + ADD to FMADD
// or MUL + ADDSUB to FMADDSUB.
const TargetOptions &Options = DAG.getTarget().Options;
bool AllowFusion =
(Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
if (!AllowFusion)
return false;
Opnd2 = Opnd1;
Opnd1 = Opnd0.getOperand(1);
Opnd0 = Opnd0.getOperand(0);
return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
/// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
SDLoc DL(BV);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
// 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
// recognition.
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = BV->getSimpleValueType(0);
unsigned NumElts = VT.getVectorNumElements();
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
unsigned Half = NumElts/2;
// Count the number of UNDEF operands in the build_vector in input.
for (unsigned i = 0, e = Half; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsLO++;
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (BV->getOperand(i)->isUndef())
NumUndefsHI++;
// Early exit if this is either a build_vector of all UNDEFs or all but
// one of the operands are UNDEF.
if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
return SDValue();
SDLoc DL(BV);
SDValue InVec0, InVec1;
if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
// Try to match an SSSE3 integer HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
}
if (!Subtarget.hasAVX())
return SDValue();
if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
// Try to match an AVX horizontal add/sub of packed single/double
// precision floating point values from 256-bit vectors.
SDValue InVec2, InVec3;
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
} else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
// Try to match an AVX2 horizontal add/sub of signed integers.
SDValue InVec2, InVec3;
unsigned X86Opcode;
bool CanFold = true;
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
X86Opcode = X86ISD::HSUB;
else
CanFold = false;
if (CanFold) {
// Fold this build_vector into a single horizontal add/sub.
// Do this only if the target has AVX2.
if (Subtarget.hasAVX2())
return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
// Do not try to expand this build_vector into a pair of horizontal
// add/sub if we can emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
isUndefLO, isUndefHI);
}
}
if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
VT == MVT::v16i16) && Subtarget.hasAVX()) {
unsigned X86Opcode;
if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HADD;
else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::HSUB;
else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::FHADD;
else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
X86Opcode = X86ISD::FHSUB;
else
return SDValue();
// Don't try to expand this build_vector into a pair of horizontal add/sub
// if we can simply emit a pair of scalar add/sub.
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
return SDValue();
// Convert this build_vector into two horizontal add/sub followed by
// a concat vector.
bool isUndefLO = NumUndefsLO == Half;
bool isUndefHI = NumUndefsHI == Half;
return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
isUndefLO, isUndefHI);
}
return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
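/// For example, the build_vector
///   ((and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// is rewritten as (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).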
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Check that all elements have the same opcode.
// TODO: Should we allow UNDEFS and if so how many?
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
if (Opcode != Op->getOperand(i).getOpcode())
return SDValue();
// TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
switch (Opcode) {
default:
return SDValue();
case ISD::AND:
case ISD::XOR:
case ISD::OR:
if (!TLI.isOperationLegalOrPromote(Opcode, VT))
return SDValue();
break;
}
SmallVector<SDValue, 4> LHSElts, RHSElts;
for (SDValue Elt : Op->ops()) {
SDValue LHS = Elt.getOperand(0);
SDValue RHS = Elt.getOperand(1);
// We expect the canonicalized RHS operand to be the constant.
if (!isa<ConstantSDNode>(RHS))
return SDValue();
LHSElts.push_back(LHS);
RHSElts.push_back(RHS);
}
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getZeroVector(VT, Subtarget, DAG, DL);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
return getOnesVector(VT, DAG, DL);
}
return SDValue();
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, DAG);
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
Elt.getOpcode() != ISD::ConstantFP)
IsAllConstants = false;
if (X86::isZeroNode(Elt))
NumZero++;
else {
assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
NonZeros |= ((uint64_t)1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
// If this is an insertion of an i64 value on x86-32, and if the top bits of
// the value are obviously zero, truncate the value to i32 and do the
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
MVT VecVT = MVT::v4i32;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
Item, Idx * 2, true, Subtarget, DAG));
}
}
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
assert((VT.is128BitVector() || VT.is256BitVector() ||
VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
if (VT.getSizeInBits() >= 256) {
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
if (Subtarget.hasAVX()) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
} else {
// Without AVX, we need to extend to a 128-bit vector and then
// insert into the 256-bit vector.
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
}
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getBitcast(VT, Item);
}
}
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
NumBits/2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
return SDValue();
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = countTrailingZeros(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
return SDValue();
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
return SDValue();
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
SDValue Upper = DAG.getBuildVector(
HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = countTrailingZeros(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1ULL << i));
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
case 1:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
break;
case 2:
Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
case 3:
Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
break;
}
}
bool Reverse1 = (NonZeros & 0x3) == 2;
bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
static_cast<int>(Reverse2 ? NumElems : NumElems+1)
};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
if (Values.size() > 1 && VT.is128BitVector()) {
// Check for a build vector from mostly shuffle plus few inserting.
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
if (Subtarget.hasSSE41()) {
SDValue Result;
if (!Op.getOperand(0).isUndef())
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).isUndef()) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
if (!Op.getOperand(i).isUndef())
Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
Ops[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
SmallVector<int, 16> Mask;
for(unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(NumElems+i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
return SDValue();
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
if (ResVT.is256BitVector())
return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
SDValue V3 = Op.getOperand(2);
SDValue V4 = Op.getOperand(3);
return concat256BitVectors(
concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
NumElems, DAG, dl);
}
return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
static bool isExpandWithZeros(const SDValue &Op) {
assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
"Expand with zeros only possible in CONCAT_VECTORS nodes!");
for (unsigned i = 1; i < Op.getNumOperands(); i++)
if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
return false;
return true;
}
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// a k-register.
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
unsigned Opc = Op.getOpcode();
assert(Opc == ISD::CONCAT_VECTORS &&
Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected node to check for type promotion!");
// As long as we are concatenating zeros to the upper part of a previous node
// result, climb up the tree until a node with a different opcode is
// encountered.
while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
if (Opc == ISD::INSERT_SUBVECTOR) {
if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
Op.getConstantOperandVal(2) == 0)
Op = Op.getOperand(1);
else
return SDValue();
} else { // Opc == ISD::CONCAT_VECTORS
if (isExpandWithZeros(Op))
Op = Op.getOperand(0);
else
return SDValue();
}
Opc = Op.getOpcode();
}
// Check if the first inserted node zeroes the upper bits, or an 'and' result
// of a node that zeros the upper bits (its masked version).
if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
(Op.getOpcode() == ISD::AND &&
(isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
return Op;
}
return SDValue();
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
unsigned NumOfOperands = Op.getNumOperands();
assert(isPowerOf2_32(NumOfOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
// If this node promotes - by concatenating zeroes - the type of the result
// of a node whose instruction already zeroes all upper (irrelevant) bits of
// the output register, mark it as legal and catch the pattern in instruction
// selection to avoid emitting extra instructions (for zeroing upper bits).
if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
ZeroC);
}
SDValue Undef = DAG.getUNDEF(ResVT);
if (NumOfOperands > 2) {
// Specialize the cases when all, or all but one, of the operands are undef.
unsigned NumOfDefinedOps = 0;
unsigned OpIdx = 0;
for (unsigned i = 0; i < NumOfOperands; i++)
if (!Op.getOperand(i).isUndef()) {
NumOfDefinedOps++;
OpIdx = i;
}
if (NumOfDefinedOps == 0)
return Undef;
if (NumOfDefinedOps == 1) {
unsigned SubVecNumElts =
Op.getOperand(OpIdx).getValueType().getVectorNumElements();
SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
Op.getOperand(OpIdx), IdxVal);
}
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
SmallVector<SDValue, 2> Ops;
for (unsigned i = 0; i < NumOfOperands/2; i++)
Ops.push_back(Op.getOperand(i));
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
Ops.clear();
for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
Ops.push_back(Op.getOperand(i));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
// 2 operands
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
assert(V1.getValueType() == V2.getValueType() &&
V1.getValueType().getVectorNumElements() == NumElems/2 &&
"Unexpected operands in CONCAT_VECTORS");
if (ResVT.getSizeInBits() >= 16)
return Op; // The operation is legal with KUNPCK
bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
if (IsZeroV1 && IsZeroV2)
return ZeroVec;
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
if (V2.isUndef())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
if (IsZeroV2)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
if (V1.isUndef())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
if (IsZeroV1)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getVectorElementType() == MVT::i1)
return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
// AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
// 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
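///
/// For example (illustrative): the mask <0, -1, 2, 3> is a no-op (element 1
/// is undef and the rest stay in place), while <1, 0, 2, 3> is not.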
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != i)
return false;
}
return true;
}
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
int LaneSize = 128 / VT.getScalarSizeInBits();
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
return true;
return false;
}
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
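///
/// Worked example (illustrative): for a v8i32 shuffle with 128-bit lanes
/// (four elements per lane), the mask <0, 1, 8, 9, 4, 5, 12, 13> repeats the
/// per-lane pattern <0, 1, 4, 5>, where 4 and 5 are second-vector elements
/// remapped into [LaneSize, 2*LaneSize).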
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
if (Mask[i] < 0)
continue;
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
: Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] == SM_SentinelZero) {
if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
return false;
RepeatedMask[i % LaneSize] = SM_SentinelZero;
continue;
}
if ((Mask[i] % Size) / LaneSize != i / LaneSize)
// This entry crosses lanes, so there is no way to model this shuffle.
return false;
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
int LocalM =
Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
else if (RepeatedMask[i % LaneSize] != LocalM)
// Found a mismatch with the repeated mask.
return false;
}
return true;
}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
/// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as ExpectedMask, and each
/// element of the mask is either -1 (signifying undef) or the value given
/// at the same position in ExpectedMask.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
if (Mask.size() != ExpectedMask.size())
return false;
int Size = Mask.size();
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
if (!MaskBV || !ExpectedBV ||
MaskBV->getOperand(Mask[i] % Size) !=
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
}
return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
for (int i = 0; i < Size; ++i)
if (Mask[i] == SM_SentinelUndef)
continue;
else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
return false;
else if (Mask[i] != ExpectedMask[i])
return false;
return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// mask.
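// Illustrative example (not in the original source): Mask <0, -1, 5, 3> with
// only bit 3 of Zeroable set becomes <0, SM_SentinelUndef, 5, SM_SentinelZero>.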
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
const APInt &Zeroable) {
int NumElts = Mask.size();
assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
}
return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return false;
SmallVector<int, 8> Unpcklwd;
createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
/* Unary = */ false);
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
isTargetShuffleEquivalent(Mask, Unpckhwd));
return IsUnpackwdMask;
}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
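///
/// Worked example (illustrative): the mask <3, 1, 2, 0> encodes as
/// 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27.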
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
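///
/// Illustrative example (assuming V2 is an all-zeros BUILD_VECTOR and V1 is
/// the BUILD_VECTOR <x, y, 0, w>): for Mask <0, -1, 2, 6>, elements 1 (undef
/// mask), 2 (references the constant 0 in V1) and 3 (references the zero
/// vector V2) are zeroable, while element 0 is not.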
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
int VectorSizeInBits = V1.getValueSizeInBits();
int ScalarSizeInBits = VectorSizeInBits / Mask.size();
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
Zeroable.setBit(i);
continue;
}
// Determine shuffle input and normalize the mask.
SDValue V = M < Size ? V1 : V2;
M %= Size;
// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
if (V.getOpcode() != ISD::BUILD_VECTOR)
continue;
// If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
// portion of the (larger) source element must be UNDEF/ZERO.
if ((Size % V.getNumOperands()) == 0) {
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
if (Val == 0)
Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
if (Val == 0)
Zeroable.setBit(i);
}
continue;
}
// If the BUILD_VECTOR has more elements than the mask, then all of the
// (smaller) source elements must be UNDEF or ZERO.
if ((V.getNumOperands() % Size) == 0) {
int Scale = V->getNumOperands() / Size;
bool AllZeroable = true;
for (int j = 0; j < Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
if (AllZeroable)
Zeroable.setBit(i);
continue;
}
}
return Zeroable;
}
// The shuffle result we are looking for has the form:
// 0...a[0] 0...a[1] ... 0...a[n], n >= 0, where the non-zero elements a[]
// appear in ascending order and the remaining positions are zeros.
// Each element of Zeroable corresponds to a particular element of Mask,
// as described in computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose non-zero elements are in
// increasing order; if such a sub-mask exists, it returns true.
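// Illustrative example (not from the original source): for a v4i32 shuffle
// with Mask <4, 0, 6, 1> where V2 is an all-zeros vector (so lanes 0 and 2
// are zeroable), the non-zero elements are V1[0] and V1[1] in increasing
// order, so the mask can be matched to a VEXPAND-style pattern with the
// zeroable lanes filled from the zero vector.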
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
for (int i = 0, e = Mask.size(); i < e; i++) {
// The mask's zeroable elements must be built from actual zeros, not undefs.
assert(Mask[i] >= -1 && "Out of bound mask element!");
if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non zero element
if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
// Exit if the mask's non zero elements are not in increasing order.
if (NextElement != Mask[i])
return false;
NextElement++;
}
return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
const int NumBytes = VT.getSizeInBits() / 8;
const int NumEltBytes = VT.getScalarSizeInBits() / 8;
assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
(Subtarget.hasAVX2() && VT.is256BitVector()) ||
(Subtarget.hasBWI() && VT.is512BitVector()));
SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
// Sign bit set in i8 mask means zero element.
SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
SDValue V;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / NumEltBytes];
if (M < 0) {
PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
continue;
}
if (Zeroable[i / NumEltBytes]) {
PSHUFBMask[i] = ZeroMask;
continue;
}
// We can only use a single input of V1 or V2.
SDValue SrcV = (M >= Size ? V2 : V1);
if (V && V != SrcV)
return SDValue();
V = SrcV;
M %= Size;
// PSHUFB can't cross lanes, ensure this doesn't happen.
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
return SDValue();
M = M % LaneSize;
M = M * NumEltBytes + (i % NumEltBytes);
PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
}
assert(V && "Failed to find a source input");
MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
bool IsLeftZeroSide = true;
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
unsigned NumElts = VT.getVectorNumElements();
assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
"Unexpected number of vector elements");
SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
return DAG.getSelect(DL, VT, VMask,
DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
unsigned &UnpackOpcode, bool IsUnary,
ArrayRef<int> TargetMask, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
int M1 = TargetMask[i + 0];
int M2 = TargetMask[i + 1];
Undef1 &= (SM_SentinelUndef == M1);
Undef2 &= (SM_SentinelUndef == M2);
Zero1 &= isUndefOrZero(M1);
Zero2 &= isUndefOrZero(M2);
}
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
"Zeroable shuffle detected");
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
return true;
}
// If a unary shuffle, attempt to match as an unpack lo/hi with zero.
if (IsUnary && (Zero1 || Zero2)) {
// Don't bother if we can blend instead.
if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
return false;
bool MatchLo = true, MatchHi = true;
for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
int M = TargetMask[i];
// Ignore if the input is known to be zero or the index is undef.
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
(M == SM_SentinelUndef))
continue;
MatchLo &= (M == Unpckl[i]);
MatchHi &= (M == Unpckh[i]);
}
if (MatchLo || MatchHi) {
UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
return true;
}
}
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
}
}
return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
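// For reference (illustrative, not in the original comment): for v4i32 the
// unpack-lo mask is <0, 4, 1, 5> and the unpack-hi mask is <2, 6, 3, 7>.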
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
}
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Zeroable[i])
continue;
if (Mask[i] % Size != i)
return SDValue(); // Not a blend.
if (!V)
V = Mask[i] < Size ? V1 : V2;
else if (V != (Mask[i] < Size ? V1 : V2))
return SDValue(); // Can only let one input through the mask.
VMaskOps[i] = AllOnes;
}
if (!V)
return SDValue(); // No non-zeroable elements!
SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
return SDValue(); // Shuffled input!
MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
}
SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
// We have to cast V2 around.
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
DAG.getBitcast(MaskVT, V1Mask),
DAG.getBitcast(MaskVT, V2)));
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
MutableArrayRef<int> TargetMask,
bool &ForceV1Zero, bool &ForceV2Zero,
uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
int M = TargetMask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
BlendMask |= 1ull << i;
continue;
}
if (M == SM_SentinelZero) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
TargetMask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
TargetMask[i] = i + Size;
continue;
}
}
return false;
}
return true;
}
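// Illustrative example (added comment, not in the original source): scaling
// BlendMask 0b0101 for 4 elements by Scale = 2 widens each selected element
// to two consecutive bits, giving 0b00110011.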
uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
uint64_t ScaledMask = 0;
for (int i = 0; i != Size; ++i)
if (BlendMask & (1ull << i))
ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
return ScaledMask;
}
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
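///
/// Illustrative example (not part of the original comment): for a v4f32
/// shuffle mask <0, 5, 2, 7>, elements 1 and 3 come from V2, so the blend
/// immediate is 0b1010.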
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Original,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
V1 = getZeroVector(VT, Subtarget, DAG, DL);
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
case MVT::v4f64:
case MVT::v8f32:
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
LLVM_FALLTHROUGH;
case MVT::v2i64:
case MVT::v4i32:
// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
// that instruction.
if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8)));
}
LLVM_FALLTHROUGH;
case MVT::v8i16: {
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8)));
}
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
LLVM_FALLTHROUGH;
}
case MVT::v16i8:
case MVT::v32i8: {
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
return Masked;
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
// This form of blend is always done on bytes. Compute the byte vector
// type.
MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// generator that boolean values in the elements of an x86 vector register
// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
// mapping a select to operand #1, and 'false' mapping to operand #2. The
// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
// of the element (the remaining are ignored) and 0 in that high bit would
// mean operand #1 while 1 in the high bit would mean operand #2. So while
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
MVT::i8));
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
VT,
DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
V1, V2));
}
case MVT::v16f32:
case MVT::v8f64:
case MVT::v8i64:
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8: {
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
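///
/// Illustrative example (not from the original comment): the 4-element mask
/// <5, 0, 7, 2> can be lowered as a blend with mask <0, 5, 2, 7> followed by
/// the single-input permute <1, 0, 3, 2>.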
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// We build up the blend mask while checking whether a blend is a viable way
// to reduce the shuffle.
SmallVector<int, 32> BlendMask(Mask.size(), -1);
SmallVector<int, 32> PermuteMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
if (BlendMask[Mask[i] % Size] < 0)
BlendMask[Mask[i] % Size] = Mask[i];
else if (BlendMask[Mask[i] % Size] != Mask[i])
return SDValue(); // Can't blend in the needed input!
PermuteMask[i] = Mask[i] % Size;
}
SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// Shuffle the input elements into the desired positions in V1 and V2 and
// blend them together.
SmallVector<int, 32> V1Mask(Mask.size(), -1);
SmallVector<int, 32> V2Mask(Mask.size(), -1);
SmallVector<int, 32> BlendMask(Mask.size(), -1);
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] < Size) {
V1Mask[i] = Mask[i];
BlendMask[i] = i;
} else if (Mask[i] >= Size) {
V2Mask[i] = Mask[i] - Size;
BlendMask[i] = i + Size;
}
// Try to lower with the simpler initial blend strategy unless one of the
// input shuffles would be a no-op. We prefer to shuffle inputs as the
// shuffle may be able to fold with a load or other benefit. However, when
// we'll have to do 2x as many shuffles in order to achieve this, blending
// first is a better strategy.
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
if (SDValue BlendPerm =
lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
return BlendPerm;
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, -1, -1, -1, -1, -1, 1, 2]
// [ 3, 4, 5, 6, 7, 8, 9, 10]
// [-1, 4, 5, 6, -1, -1, 9, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
"Unexpected mask index.");
if (M < 0)
continue;
// Determine where a rotated vector would have started.
int StartIdx = i - (M % NumElts);
if (StartIdx == 0)
// The identity rotation isn't interesting, stop.
return -1;
// If we found the tail of a vector (StartIdx < 0), the rotation is the
// missing front, i.e. -StartIdx. If we found the head of a vector
// (StartIdx > 0), the rotation is what remains after the head, i.e.
// NumElts - StartIdx.
int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
if (Rotation == 0)
Rotation = CandidateRotation;
else if (Rotation != CandidateRotation)
// The rotations don't match, so we can't match this mask.
return -1;
// Compute which value this mask is pointing at.
SDValue MaskV = M < NumElts ? V1 : V2;
// Compute which of the two target values this index should be assigned
// to. This reflects whether the high elements are remaining or the low
// elements are remaining.
SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
// Either set up this value if we've not encountered it before, or check
// that it remains consistent.
if (!TargetV)
TargetV = MaskV;
else if (TargetV != MaskV)
// This may be a rotation, but it pulls from the inputs in some
// unsupported interleaving.
return -1;
}
// Check that we successfully analyzed the mask, and normalize the results.
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
if (!Lo)
Lo = Hi;
else if (!Hi)
Hi = Lo;
V1 = Lo;
V2 = Hi;
return Rotation;
}
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
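///
/// Added note (illustrative): for the v8i16 example above the element
/// rotation is 3; with 2-byte elements this becomes a byte rotation
/// (PALIGNR immediate) of 6.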
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
return -1;
// PALIGNR works on 128-bit lanes.
SmallVector<int, 16> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
// PALIGNR rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector lane.
int NumElts = RepeatedMask.size();
int Scale = 16 / NumElts;
return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
SDValue Lo = V1, Hi = V2;
int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
if (ByteRotation <= 0)
return SDValue();
// Cast the inputs to an i8 vector of the correct length to match PALIGNR or
// PSLLDQ/PSRLDQ.
MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
Lo = DAG.getBitcast(ByteVT, Lo);
Hi = DAG.getBitcast(ByteVT, Hi);
// SSSE3 targets can use the palignr instruction.
if (Subtarget.hasSSSE3()) {
assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
DAG.getConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 &&
"SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
DAG.getConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
DAG.getConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz, 2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [ 1, zz, 3, zz]
/// [ -1, -1, 7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz, 0, 1, 2, 3, 4, 5, 6]
/// [ zz, zz, -1, -1, 2, 3, 4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1, 1]
/// PSRLDQ : (little-endian) right byte shift
/// [ 5, 6, 7, zz, zz, zz, zz, zz]
/// [ -1, 5, 6, 7, zz, zz, zz, zz]
/// [ 1, 2, -1, -1, -1, -1, zz, zz]
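///
/// Worked example (illustrative): the v4i32 mask [ zz, 0, zz, 2 ] shown above
/// matches VSHLI on v2i64 with a 32-bit shift amount (Scale = 2, Shift = 1).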
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask, int MaskOffset,
const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
auto CheckZeros = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i < Size; i += Scale)
for (int j = 0; j < Shift; ++j)
if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
return false;
return true;
};
auto MatchShift = [&](int Shift, int Scale, bool Left) {
for (int i = 0; i != Size; i += Scale) {
unsigned Pos = Left ? i + Shift : i;
unsigned Low = Left ? i : i + Shift;
unsigned Len = Scale - Shift;
if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
return -1;
}
int ShiftEltBits = ScalarSizeInBits * Scale;
bool ByteShift = ShiftEltBits > 64;
Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
// Normalize the scale for byte shifts to still produce an i64 element
// type.
Scale = ByteShift ? Scale / 2 : Scale;
// We need to round trip through the appropriate type for the shift.
MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
: MVT::getVectorVT(ShiftSVT, Size / Scale);
return (int)ShiftAmt;
};
// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
// keep doubling the size of the integer elements up to that. We can
// then shift the elements of the integer vector by whole multiples of
// their width within the elements of the larger integer vector. Test each
// multiple to see if we can find a match with the moved element indices
// and that the shifted in elements are all zeroable.
unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
for (int Shift = 1; Shift != Scale; ++Shift)
for (bool Left : {true, false})
if (CheckZeros(Shift, Scale, Left)) {
int ShiftAmt = MatchShift(Shift, Scale, Left);
if (0 < ShiftAmt)
return ShiftAmt;
}
// no match
return -1;
}
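/// \brief Try to lower a vector shuffle as a bit or byte shift of one input.
///
/// This first tries to match the mask as a shift of V1 (mask offset 0) and
/// then as a shift of V2 (mask offset Size), bitcasting through the shift
/// vector type selected by matchVectorShuffleAsShift.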
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
MVT ShiftVT;
SDValue V = V1;
unsigned Opcode;
// Try to match shuffle against V1 shift.
int ShiftAmt = matchVectorShuffleAsShift(
ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
// If V1 failed, try to match shuffle against V2 shift.
if (ShiftAmt < 0) {
ShiftAmt =
matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
Mask, Size, Zeroable, Subtarget);
V = V2;
}
if (ShiftAmt < 0)
return SDValue();
assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
DAG.getConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
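// For example, a v8i16 shuffle mask [ 2, 3, zz, zz, -1, -1, -1, -1 ] extracts
// Len = 2 elements starting at Idx = 2, i.e. BitLen = 32 and BitIdx = 32.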
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
return false;
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
// Attempt to match first Len sequential elements from the lower half.
SDValue Src;
int Idx = -1;
for (int i = 0; i != Len; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
SDValue &V = (M < Size ? V1 : V2);
M = M % Size;
// The extracted elements must start at a valid index and all mask
// elements must be in the lower half.
if (i > M || M >= HalfSize)
return false;
if (Idx < 0 || (Src == V && Idx == (M - i))) {
Src = V;
Idx = M - i;
continue;
}
return false;
}
if (!Src || Idx < 0)
return false;
assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Src;
return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
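// For example, a v8i16 shuffle mask [ 0, 8, 9, 3, -1, -1, -1, -1 ] inserts
// Len = 2 elements of V2 at Idx = 1, i.e. BitLen = 32 and BitIdx = 16.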
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask, uint64_t &BitLen,
uint64_t &BitIdx) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
return false;
for (int Idx = 0; Idx != HalfSize; ++Idx) {
SDValue Base;
// Attempt to match first source from mask before insertion point.
if (isUndefInRange(Mask, 0, Idx)) {
/* EMPTY */
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
Base = V1;
} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
Base = V2;
} else {
continue;
}
// Extend the extraction length looking to match both the insertion of
// the second source and the remaining elements of the first.
for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
SDValue Insert;
int Len = Hi - Idx;
// Match insertion.
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
Insert = V1;
} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
Insert = V2;
} else {
continue;
}
// Match the remaining elements of the lower half.
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
} else if ((!Base || (Base == V1)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
} else if ((!Base || (Base == V2)) &&
isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
Size + Hi)) {
Base = V2;
} else {
continue;
}
BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
V1 = Base;
V2 = Insert;
return true;
}
}
return false;
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
SelectionDAG &DAG) {
uint64_t BitLen, BitIdx;
if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling, the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from the same lane.
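///
/// When \p AnyExt is set the high bits of each extended element may be
/// anything, which allows cheaper lowerings (PSHUFD/PSHUFLW/PSHUFHW or an
/// unpack with undef) instead of a true zero extension.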
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
int EltBits = VT.getScalarSizeInBits();
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = 128 / EltBits;
int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
assert(0 <= Offset && "Extension offset must be positive.");
assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
"Extension offset must be in the first lane or start an upper lane.");
// Check that an index is in same lane as the base offset.
auto SafeOffset = [&](int Idx) {
return OffsetLane == (Idx / NumEltsPerLane);
};
// Shift along an input so that the offset base moves to the first element.
auto ShuffleOffset = [&](SDValue V) {
if (!Offset)
return V;
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = 0; i * Scale < NumElements; ++i) {
int SrcIdx = i + Offset;
ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
}
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2; a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
-1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {Offset / 2, -1,
SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFWMask[4] = {1, -1, -1, -1};
unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
return DAG.getBitcast(
VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
assert(VT.is128BitVector() && "Unexpected vector width!");
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(LoIdx, DL, MVT::i8)));
if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
!SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
DAG.getConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
// If this would require more than 2 unpack instructions to expand, use
// pshufb when available. We can only use more than 2 unpack instructions
// when zero extending i8 elements which also makes it easier to use pshufb.
if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
PSHUFBMask[i] = DAG.getConstant(
(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
}
// If we are extending from an offset, ensure we start on a boundary that
// we can unpack from.
int AlignToUnpack = Offset % (NumElements / Scale);
if (AlignToUnpack) {
SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
for (int i = AlignToUnpack; i < NumElements; ++i)
ShMask[i - AlignToUnpack] = i;
InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
Offset -= AlignToUnpack;
}
// Otherwise emit a sequence of unpacks.
do {
unsigned UnpackLoHi = X86ISD::UNPCKL;
if (Offset >= (NumElements / 2)) {
UnpackLoHi = X86ISD::UNPCKH;
Offset -= (NumElements / 2);
}
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
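///
/// For example, a v4i32 shuffle mask [ 0, zz, 1, zz ] is matched with
/// Scale == 2 and lowered as a zero extension of the two low elements to
/// v2i64 (a single PMOVZXDQ when SSE4.1 is available).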
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
int Offset = 0;
int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
int M = Mask[i];
if (M < 0)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
// We no longer are in the anyext case.
AnyExt = false;
continue;
}
// The base elements need to be consecutive indices into the same input
// vector.
SDValue V = M < NumElements ? V1 : V2;
M = M % NumElements;
if (!InputV) {
InputV = V;
Offset = M - (i / Scale);
} else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
// Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
// FIXME: Is it ever worth allowing a negative base offset?
if (!((0 <= Offset && Offset < NumEltsPerLane) ||
(Offset % NumEltsPerLane) == 0))
return SDValue();
// If we are offsetting, all referenced entries must come from the same
// lane.
if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
return SDValue();
if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
// have already been handled.
// FIXME: Maybe handle this here in case during blending we end up with one?
if (!InputV)
return SDValue();
// If we are offsetting, don't extend if we only match a single input; we
// can always do better by using a basic PSHUF or PUNPCK.
if (Offset != 0 && Matches < 2)
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
assert(Bits % 64 == 0 &&
"The number of bits in a vector must be divisible by 64 on x86!");
int NumExtElements = Bits / 64;
// Each iteration, try extending the elements half as much, but into twice as
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
"The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
// General extends failed, but 128-bit vectors may be able to use MOVQ.
if (Bits != 128)
return SDValue();
// Returns one of the source operands if the shuffle can be reduced to a
// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
auto CanZExtLowHalf = [&]() {
for (int i = NumElements / 2; i != NumElements; ++i)
if (!Zeroable[i])
return SDValue();
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
return V1;
if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
return V2;
return SDValue();
};
if (SDValue V = CanZExtLowHalf()) {
V = DAG.getBitcast(MVT::v2i64, V);
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
return DAG.getBitcast(VT, V);
}
// No viable ext lowering found.
return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
SelectionDAG &DAG) {
MVT VT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
V = peekThroughBitcasts(V);
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
// Ensure the scalar operand is the same size as the destination.
// FIXME: Add support for scalar truncation where possible.
SDValue S = V.getOperand(Idx);
if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
return DAG.getBitcast(EltVT, S);
}
return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
V = peekThroughBitcasts(V);
return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
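///
/// For example, inserting the low element of V2 into lane 0 of an otherwise
/// zeroable v4f32 shuffle is lowered to a single X86ISD::VZEXT_MOVL of V2,
/// which keeps the low element and zeroes the upper lanes.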
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
bool IsV1Zeroable = true;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (i != V2Index && !Zeroable[i]) {
IsV1Zeroable = false;
break;
}
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
if (!IsV1Zeroable)
return SDValue();
// Zero-extend directly to i32.
ExtVT = MVT::v4i32;
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
} else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
EltVT == MVT::i16) {
// Either not inserting from the low element of the input or the input
// element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
}
if (!IsV1Zeroable) {
// If V1 can't be treated as a zero vector we have fewer options to lower
// this. We can't support integer vectors or non-zero targets cheaply, and
// the V1 elements can't be permuted in any way.
assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
if (!VT.isFloatingPoint() || V2Index != 0)
return SDValue();
SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
// This is essentially a special case blend operation, but if we have
// general purpose blend operations, they are always faster. Bail and let
// the rest of the lowering handle these as blends.
if (Subtarget.hasSSE41())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
"Only two types of floating point element types to handle!");
return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
ExtVT, V1, V2);
}
// This lowering only works for the low element with floating point vectors.
if (VT.isFloatingPoint() && V2Index != 0)
return SDValue();
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
// the desired position. Otherwise it is more efficient to do a vector
// shift left. We know a vector shift left is safe here because all the
// other lanes are zero.
if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
DAG.getDataLayout(), VT)));
V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
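///
/// For example, broadcasting element 1 of a v8i32 shuffle whose input \p V0
/// is a v4i64 BUILD_VECTOR uses scalar operand 0 of \p V0: it is shifted
/// right by 32 bits, truncated to i32 and broadcast with VBROADCAST.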
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
SDValue V0, int BroadcastIdx,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
EVT EltVT = VT.getVectorElementType();
EVT V0VT = V0.getValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
EVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
const unsigned EltSize = EltVT.getSizeInBits();
const unsigned V0EltSize = V0EltVT.getSizeInBits();
// This is only a truncation if the original element type is larger.
if (V0EltSize <= EltSize)
return SDValue();
assert(((V0EltSize % EltSize) == 0) &&
"Scalar type sizes must all be powers of 2 on x86!");
const unsigned V0Opc = V0.getOpcode();
const unsigned Scale = V0EltSize / EltSize;
const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
V0Opc != ISD::BUILD_VECTOR)
return SDValue();
SDValue Scalar = V0.getOperand(V0BroadcastIdx);
// If we're extracting non-least-significant bits, shift so we can truncate.
// Hopefully, we can fold away the trunc/srl/load into the broadcast.
// Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
// vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
if (const int OffsetIdx = BroadcastIdx % Scale)
Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
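///
/// For example, a v8f32 splat of element 0 becomes a single VBROADCAST: from
/// a register when AVX2 is available, otherwise only when the source is a
/// foldable scalar load.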
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
(Subtarget.hasAVX() && VT.isFloatingPoint()) ||
(Subtarget.hasAVX2() && VT.isInteger())))
return SDValue();
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
int BroadcastIdx = -1;
for (int i = 0; i != (int)NumElts; ++i) {
SmallVector<int, 8> BroadcastMask(NumElts, i);
if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
BroadcastIdx = i;
break;
}
}
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
"a sorted mask where the broadcast "
"comes from V1.");
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
SDValue V = V1;
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
SDValue VSrc = V.getOperand(0);
MVT SrcVT = VSrc.getSimpleValueType();
if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
break;
V = VSrc;
continue;
}
case ISD::CONCAT_VECTORS: {
int OperandSize = Mask.size() / V.getNumOperands();
V = V.getOperand(BroadcastIdx / OperandSize);
BroadcastIdx %= OperandSize;
continue;
}
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
if (!ConstantIdx)
break;
int BeginIdx = (int)ConstantIdx->getZExtValue();
int EndIdx =
BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
BroadcastIdx -= BeginIdx;
V = VInner;
} else {
V = VOuter;
}
continue;
}
}
break;
}
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// First, look through bitcast: if the original value has a larger element
// type than the shuffle, the broadcast element is in essence truncated.
// Make that explicit to ease folding.
if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
MVT BroadcastVT = VT;
// Peek through any bitcast (only useful for loads).
SDValue BC = peekThroughBitcasts(V);
// Also check the simpler case, where we can directly reuse the scalar.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
} else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
}
// If we are broadcasting a load that is only used by the shuffle
// then we can reduce the vector load to the broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(BC);
SDValue BaseAddr = Ld->getOperand(1);
EVT SVT = BroadcastVT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
} else if (BroadcastIdx != 0) {
// We can only broadcast from the zero-element of a vector register,
// but it can be advantageous to broadcast from the zero-element of a
// subvector.
if (!VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// Only broadcast the zero-element of a 128-bit subvector.
unsigned EltSize = VT.getScalarSizeInBits();
if (((BroadcastIdx * EltSize) % 128) != 0)
return SDValue();
// The shuffle input might have been a bitcast we looked through; look at
// the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
// later bitcast it to BroadcastVT.
MVT SrcVT = V.getSimpleValueType();
assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
"Unexpected vector size");
MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
DAG.getIntPtrConstant(BroadcastIdx, DL));
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
// Bitcast back to the same scalar type as BroadcastVT.
MVT SrcVT = V.getSimpleValueType();
if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
} else {
SrcVT = BroadcastVT.getScalarType();
}
V = DAG.getBitcast(SrcVT, V);
}
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
V = DAG.getBitcast(MVT::f64, V);
unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits.
if (SrcVT.getSizeInBits() > 128)
V = extract128BitVector(V, 0, DAG, DL);
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always just use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
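// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0]; e.g. a
// v4f32 shuffle mask [ 4, 1, 2, 3 ] with nothing zeroable produces an
// immediate of 0x00 (insert element 0 of V2 into element 0 of V1).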
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
const APInt &Zeroable,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Attempt to match INSERTPS with one element from VA or VB being
// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
// are updated.
auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
ArrayRef<int> CandidateMask) {
unsigned ZMask = 0;
int VADstIndex = -1;
int VBDstIndex = -1;
bool VAUsedInPlace = false;
for (int i = 0; i < 4; ++i) {
// Synthesize a zero mask from the zeroable elements (includes undefs).
if (Zeroable[i]) {
ZMask |= 1 << i;
continue;
}
// Flag if we use any VA inputs in place.
if (i == CandidateMask[i]) {
VAUsedInPlace = true;
continue;
}
// We can only insert a single non-zeroable element.
if (VADstIndex >= 0 || VBDstIndex >= 0)
return false;
if (CandidateMask[i] < 4) {
// VA input out of place for insertion.
VADstIndex = i;
} else {
// VB input for insertion.
VBDstIndex = i;
}
}
// Don't bother if we have no (non-zeroable) element for insertion.
if (VADstIndex < 0 && VBDstIndex < 0)
return false;
// Determine element insertion src/dst indices. The src index is from the
// start of the inserted vector, not the start of the concatenated vector.
unsigned VBSrcIndex = 0;
if (VADstIndex >= 0) {
// If we have a VA input out of place, we use VA as the V2 element
// insertion and don't use the original V2 at all.
VBSrcIndex = CandidateMask[VADstIndex];
VBDstIndex = VADstIndex;
VB = VA;
} else {
VBSrcIndex = CandidateMask[VBDstIndex] - 4;
}
// If no V1 inputs are used in place, then the result is created only from
// the zero mask and the V2 insertion - so remove V1 dependency.
if (!VAUsedInPlace)
VA = DAG.getUNDEF(MVT::v4f32);
// Update V1, V2 and InsertPSMask accordingly.
V1 = VA;
V2 = VB;
// Insert the V2 element into the desired position.
InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
return true;
};
if (matchAsInsertPS(V1, V2, Mask))
return true;
// Commute and try again.
SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
ShuffleVectorSDNode::commuteMask(CommutedMask);
if (matchAsInsertPS(V2, V1, CommutedMask))
return true;
return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
unsigned InsertPSMask;
if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
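///
/// For example, a v4i32 shuffle mask [ 0, 4, 2, 6 ] is lowered by shuffling
/// both inputs with [ 0, 2, -1, -1 ] and feeding the results to one UNPCKL.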
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
"This routine only supports integer vectors.");
assert(VT.is128BitVector() &&
"This routine only works on 128-bit vectors.");
assert(!V2.isUndef() &&
"This routine should only be used when blending two inputs.");
assert(Mask.size() >= 2 && "Single element masks are invalid.");
int Size = Mask.size();
int NumLoInputs =
count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
int NumHiInputs =
count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
bool UnpackLo = NumLoInputs >= NumHiInputs;
auto TryUnpack = [&](int ScalarSize, int Scale) {
SmallVector<int, 16> V1Mask((unsigned)Size, -1);
SmallVector<int, 16> V2Mask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
// Each element of the unpack contains Scale elements from this mask.
int UnpackIdx = i / Scale;
// We only handle the case where V1 feeds the first slots of the unpack.
// We rely on canonicalization to ensure this is the case.
if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
return SDValue();
// Setup the mask for this input. The indexing is tricky as we have to
// handle the unpack stride.
SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
Mask[i] % Size;
}
// If we will have to shuffle both inputs to use the unpack, check whether
// we can just unpack first and shuffle the result. If so, skip this unpack.
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
!isNoopShuffleMask(V2Mask))
return SDValue();
// Shuffle the inputs into place.
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
// Cast the inputs to the type we will use to unpack them.
MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
V1 = DAG.getBitcast(UnpackVT, V1);
V2 = DAG.getBitcast(UnpackVT, V2);
// Unpack the inputs and cast the result back to the desired type.
return DAG.getBitcast(
VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
UnpackVT, V1, V2));
};
// We try each unpack from the largest to the smallest to try and find one
// that fits this mask.
int OrigScalarSize = VT.getScalarSizeInBits();
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
return Unpack;
// If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack.
if (NumLoInputs == 0 || NumHiInputs == 0) {
assert((NumLoInputs > 0 || NumHiInputs > 0) &&
"We have to have *some* inputs!");
int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
// FIXME: We could consider the total complexity of the permute of each
// possible unpacking. Or at the least we should consider how many
// half-crossings are created.
// FIXME: We could consider commuting the unpacks.
SmallVector<int, 32> PermMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
PermMask[i] =
2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
}
return DAG.getVectorShuffle(
VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
DL, VT, V1, V2),
DAG.getUNDEF(VT), PermMask);
}
return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
}
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return DAG.getNode(
isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
DL, MVT::v2f64, V2,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
return V;
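// Build the SHUFPD immediate: bit 0 selects the element of V1 for the low
// lane and bit 1 selects the element of V2 for the high lane; e.g. a mask
// of [ 1, 3 ] yields an immediate of 3.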
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
}
assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
// If we have a blend of two same-type PACKUS operations and the blend aligns
// with the low and high halves, we can just merge the PACKUS operations.
// This is particularly important as it lets us merge shuffles that this
// routine itself creates.
auto GetPackNode = [](SDValue V) {
V = peekThroughBitcasts(V);
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
};
if (SDValue V1Pack = GetPackNode(V1))
if (SDValue V2Pack = GetPackNode(V2)) {
EVT PackVT = V1Pack.getValueType();
if (PackVT == V2Pack.getValueType())
return DAG.getBitcast(MVT::v2i64,
DAG.getNode(X86ISD::PACKUS, DL, PackVT,
Mask[0] == 0 ? V1Pack.getOperand(0)
: V1Pack.getOperand(1),
Mask[1] == 2 ? V2Pack.getOperand(0)
: V2Pack.getOperand(1)));
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
Mask, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
V1 = DAG.getBitcast(MVT::v2f64, V1);
V2 = DAG.getBitcast(MVT::v2f64, V2);
return DAG.getBitcast(MVT::v2i64,
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
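///
/// For example, [ 0, 2, 4, 6 ] uses one input per half and so can be a
/// single SHUFPS, whereas [ 0, 4, 1, 5 ] mixes both inputs in the low half
/// and cannot.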
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
// This routine only handles 128-bit shufps.
assert(Mask.size() == 4 && "Unsupported mask size!");
assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
// To lower with a single SHUFPS we need to have the low half and high half
// each requiring a single input.
if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
return false;
if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
return false;
return true;
}
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
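///
/// For example, a v4f32 shuffle mask [ 0, 1, 4, 5 ] is emitted as SHUFPS
/// with V1 as the low source and V2 as the high source, using the immediate
/// for [ 0, 1, 0, 1 ].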
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
if (Mask[V2AdjIndex] < 0) {
// Handles all the cases where we have a single V2 element and an undef.
// This will only ever happen in the high lanes because we commute the
// vector otherwise.
if (V2Index < 2)
std::swap(LowV, HighV);
NewMask[V2Index] -= 4;
} else {
// Handle the case where the V2 element ends up adjacent to a V1 element.
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
if (V2Index < 2) {
LowV = V2;
HighV = V1;
} else {
HighV = V2;
}
NewMask[V1Index] = 2; // We put the V1 element in V2[2].
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
} else if (Mask[2] < 4 && Mask[3] < 4) {
// We also handle the reversed case because this utility may get called
// when we detect a SHUFPS pattern but can't easily commute the shuffle to
// arrange things in the right direction.
NewMask[0] -= 4;
NewMask[1] -= 4;
HighV = V1;
LowV = V2;
} else {
// We have a mixture of V1 and V2 in both low and high lanes. Rather than
// trying to place elements directly, just blend them and set up the final
// shuffle to place them.
// The first two blend mask elements are for V1, the second two are for
// V2.
int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
LowV = HighV = V1;
NewMask[0] = Mask[0] < 4 ? 0 : 2;
NewMask[1] = Mask[0] < 4 ? 2 : 0;
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
if (Subtarget.hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// There are special ways we can lower some single-element blends. However, we
// have custom lowerings for the more complex single-element blends below, and
// we defer to those if both this and BLENDPS fail to match, so restrict this
// to the case where the V2 input targets element 0 of the mask -- that is the
// fast case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue V = lowerVectorShuffleAsElementInsertion(
DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V =
lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
return V;
if (!isSingleSHUFPSMask(Mask))
if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
DL, MVT::v4f32, V1, V2, Mask, DAG))
return BlendPerm;
}
// Use low/high mov instructions.
if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions
// but we aren't actually going to use the UNPCK instruction because doing
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(
DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (!isSingleSHUFPSMask(Mask)) {
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
Mask, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
DL, MVT::v4i32, V1, V2, Mask, DAG))
return Unpack;
}
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
SmallVector<int, 4> LoInputs;
copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
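// Count how many of each destination half's inputs come from the low source
// half (indices 0-3) versus the high source half (indices 4-7). LoInputs and
// HiInputs are sorted and unique, so lower_bound on 4 splits them.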
int NumLToL =
std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
int NumHToL = LoInputs.size() - NumLToL;
int NumLToH =
std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
int NumHToH = HiInputs.size() - NumLToH;
MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
// If we are splatting two values from one half - one to each half, then
// we can shuffle that half so each is splatted to a dword, then splat those
// to their respective halves.
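// For example, Mask = <1,1,1,1,3,3,3,3> becomes a PSHUFLW with [1,1,3,3]
// followed by a PSHUFD with [0,0,1,1].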
auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
int DOffset) {
int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
V = DAG.getNode(ShufWOp, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
V = DAG.getBitcast(PSHUFDVT, V);
V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
return DAG.getBitcast(VT, V);
};
if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs from each half to each half. Once there, we can fall through
// to the generic code below. For example:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
//
// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
// and an existing 2-into-2 on the other half. In this case we may have to
// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
// Fortunately, we don't have to handle anything but a 2-into-2 pattern
// because any other situation (including a 3-into-1 or 1-into-3 in the other
// half than the one we target for fixing) will be fixed when we re-enter this
// path. We will also combine any resulting sequence of PSHUFD instructions
// into a single instruction. Here is an example of the tricky case:
//
// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
//
// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
//
// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
//
// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
//
// The result is fine to be handled by the generic logic.
auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
int AOffset, int BOffset) {
assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
"Must call this with A having 3 or 1 inputs from the A half.");
assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
"Must call this with B having 1 or 3 inputs from the B half.");
assert(AToAInputs.size() + BToAInputs.size() == 4 &&
"Must call this with either 3:1 or 1:3 inputs (summing to 4).");
bool ThreeAInputs = AToAInputs.size() == 3;
// Compute the index of dword with only one word among the three inputs in
// a half by taking the sum of the half with three inputs and subtracting
// the sum of the actual three inputs. The difference is the remaining
// slot.
int ADWord, BDWord;
int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
// OneInput is in.
OneInputDWord = (OneInput / 2) ^ 1;
// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
// and BToA inputs. If there is also such a problem with the BToB and AToB
// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
// is essential that we don't *create* a 3<-1 as then we might oscillate.
if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
// Compute how many inputs will be flipped by swapping these DWords. We
// need to balance this to ensure we don't form a 3-1 shuffle in the
// other half.
int NumFlippedAToBInputs =
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
int NumFlippedBToBInputs =
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
if ((NumFlippedAToBInputs == 1 &&
(NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
(NumFlippedBToBInputs == 1 &&
(NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
// We choose whether to fix the A half or B half based on whether that
// half has zero flipped inputs. At zero, we may not be able to fix it
// with that half. We also bias towards fixing the B half because that
// will more commonly be the high half, and we have to bias one way.
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
ArrayRef<int> Inputs) {
int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
// Determine whether the free index is in the flipped dword or the
// unflipped dword based on where the pinned index is. We use this bit
// in an xor to conditionally select the adjacent dword.
int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
if (IsFixIdxInput == IsFixFreeIdxInput)
FixFreeIdx += 1;
IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
assert(IsFixIdxInput != IsFixFreeIdxInput &&
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(
FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
M = FixFreeIdx;
else if (M >= 0 && M == FixFreeIdx)
M = FixIdx;
};
if (NumFlippedBToBInputs != 0) {
int BPinnedIdx =
BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
} else {
assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
}
}
}
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
if (M >= 0 && M/2 == ADWord)
M = 2 * BDWord + M % 2;
else if (M >= 0 && M/2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
// each half. That means the inputs can always be grouped into dwords and
// those dwords can then be moved to the correct half with a dword shuffle.
// We use at most one low and one high word shuffle to collect these paired
// inputs into dwords, and finally a dword shuffle to place them.
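// PSHUFLMask/PSHUFHMask describe the word shuffles within the low and high
// halves, and PSHUFDMask the dword shuffle that moves paired inputs across
// halves; -1 entries are still unconstrained.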
int PSHUFLMask[4] = {-1, -1, -1, -1};
int PSHUFHMask[4] = {-1, -1, -1, -1};
int PSHUFDMask[4] = {-1, -1, -1, -1};
// First fix the masks for all the inputs that are staying in their
// original halves. This will then dictate the targets of the cross-half
// shuffles.
auto fixInPlaceInputs =
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
if (InPlaceInputs.empty())
return;
if (InPlaceInputs.size() == 1) {
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
return;
}
if (IncomingInputs.empty()) {
// Just fix all of the in place inputs.
for (int Input : InPlaceInputs) {
SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
PSHUFDMask[Input / 2] = Input / 2;
}
return;
}
assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
InPlaceInputs[0] - HalfOffset;
// Put the second input next to the first so that they are packed into
// a dword. We find the adjacent index by toggling the low bit.
int AdjIndex = InPlaceInputs[0] ^ 1;
SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
};
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
// Now gather the cross-half inputs and place them into a free dword of
// their target half.
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
int DestOffset) {
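// A word slot is "clobbered" once the half shuffle being built moves a
// different word into it, so the value originally there can no longer be
// read from that slot.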
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
int Word) {
int LowWord = Word & ~1;
int HighWord = Word | 1;
return isWordClobbered(SourceHalfMask, LowWord) ||
isWordClobbered(SourceHalfMask, HighWord);
};
if (IncomingInputs.empty())
return;
if (ExistingInputs.empty()) {
// Map any dwords with inputs from them into the right half.
for (int Input : IncomingInputs) {
// If the source half mask maps over the inputs, turn those into
// swaps and use the swapped lane.
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
Input - SourceOffset;
// We have to swap the uses in our half mask in one sweep.
for (int &M : HalfMask)
if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
M = Input;
else if (M == Input)
M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
} else {
assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
Input - SourceOffset &&
"Previous placement doesn't match!");
}
// Note that this correctly re-maps both when we do a swap and when
// we observe the other side of the swap above. We rely on that to
// avoid swapping the members of the input list directly.
Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
}
// Map the input's dword into the correct half.
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
else
assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
Input / 2 &&
"Previous placement doesn't match!");
}
// And just directly shift any other-half mask elements to be same-half
// as we will have mirrored the dword containing the element into the
// same position within that half.
for (int &M : HalfMask)
if (M >= SourceOffset && M < SourceOffset + 4) {
M = M - SourceOffset + DestOffset;
assert(M >= 0 && "This should never wrap below zero!");
}
return;
}
// Ensure we have the input in a viable dword of its current half. This
// is particularly tricky because the original position may be clobbered
// by inputs being moved and *staying* in that half.
if (IncomingInputs.size() == 1) {
if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
SourceOffset;
SourceHalfMask[InputFixed - SourceOffset] =
IncomingInputs[0] - SourceOffset;
std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
InputFixed);
IncomingInputs[0] = InputFixed;
}
} else if (IncomingInputs.size() == 2) {
if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
// We have two non-adjacent or clobbered inputs we need to extract from
// the source half. To do this, we need to map them into some adjacent
// dword slot in the source mask.
int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
IncomingInputs[1] - SourceOffset};
// If there is a free slot in the source half mask adjacent to one of
// the inputs, place the other input in it. We use (Index XOR 1) to
// compute an adjacent index.
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
InputsFixed[1] = InputsFixed[0] ^ 1;
} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
InputsFixed[0] = InputsFixed[1] ^ 1;
} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
// The two inputs are in the same DWord but it is clobbered and the
// adjacent DWord isn't used at all. Move both inputs to the free
// slot.
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
} else {
// The only way we hit this point is if there is no clobbering
// (because there are no off-half inputs to this half) and there is no
// free slot adjacent to one of the inputs. In this case, we have to
// swap an input with a non-input.
for (int i = 0; i < 4; ++i)
assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
"We can't handle any clobbers here!");
assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
"Cannot have adjacent inputs here!");
SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
// We also have to update the final source mask in this case because
// it may need to undo the above swap.
for (int &M : FinalSourceHalfMask)
if (M == (InputsFixed[0] ^ 1) + SourceOffset)
M = InputsFixed[1] + SourceOffset;
else if (M == InputsFixed[1] + SourceOffset)
M = (InputsFixed[0] ^ 1) + SourceOffset;
InputsFixed[1] = InputsFixed[0] ^ 1;
}
// Point everything at the fixed inputs.
for (int &M : HalfMask)
if (M == IncomingInputs[0])
M = InputsFixed[0] + SourceOffset;
else if (M == IncomingInputs[1])
M = InputsFixed[1] + SourceOffset;
IncomingInputs[0] = InputsFixed[0] + SourceOffset;
IncomingInputs[1] = InputsFixed[1] + SourceOffset;
}
} else {
llvm_unreachable("Unhandled input size!");
}
// Now hoist the DWord down to the right half.
int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
for (int &M : HalfMask)
for (int Input : IncomingInputs)
if (M == Input)
M = FreeDWord * 2 + Input % 2;
};
moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
/*SourceOffset*/ 4, /*DestOffset*/ 0);
moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
/*SourceOffset*/ 0, /*DestOffset*/ 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
V = DAG.getBitcast(
VT,
DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
"Failed to lift all the high half inputs to the low mask!");
assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
"Failed to lift all the low half inputs to the high mask!");
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
V1InUse = false;
V2InUse = false;
int Size = Mask.size();
int Scale = 16 / Size;
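// Scale is how many v16i8 byte lanes each element of the incoming mask
// covers (e.g. 2 when lowering a v8i16 mask).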
for (int i = 0; i < 16; ++i) {
if (Mask[i / Scale] < 0) {
V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
const int ZeroMask = 0x80;
int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
: ZeroMask;
int V2Idx = Mask[i / Scale] < Size
? ZeroMask
: (Mask[i / Scale] - Size) * Scale + i % Scale;
if (Zeroable[i / Scale])
V1Idx = V2Idx = ZeroMask;
V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
V1InUse |= (ZeroMask != V1Idx);
V2InUse |= (ZeroMask != V2Idx);
}
}
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V1),
DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
if (V2InUse)
V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, V2),
DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
// If we need shuffled inputs from both, blend the two.
SDValue V;
if (V1InUse && V2InUse)
V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
else
V = V1InUse ? V1 : V2;
// Cast the result back to the correct type.
return DAG.getBitcast(VT, V);
}
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
Mask, Subtarget, DAG))
return Rotate;
// Make a copy of the mask so it can be modified.
SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
MutableMask, Subtarget,
DAG);
}
assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
"All single-input shuffles should be canonicalized to be V1-input "
"shuffles.");
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(
DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
if (SDValue BitBlend =
lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
Zeroable, DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
// partially undef inputs.
bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
// Ignore undef lanes, we'll optimistically collapse them to the pattern we
// want.
if (Mask[i] < 0)
continue;
bool IsAnyViable = false;
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j]) {
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
}
// Early exit if we exhaust the possible powers of two.
if (!IsAnyViable)
break;
}
for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
if (ViableForN[j])
return j + 1;
// Return 0 as there is no viable power of two.
return 0;
}
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity-reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
// things significantly. Currently, this means we need to be able to
// express the pre-duplication shuffle as an i16 shuffle.
//
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
for (int i = 0; i < 16; i += 2)
if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
return false;
return true;
};
auto tryToWidenViaDuplication = [&]() -> SDValue {
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
copy_if(Mask, std::back_inserter(LoInputs),
[](int M) { return M >= 0 && M < 8; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
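// Gather everything into whichever half already holds more of the used
// bytes so that fewer inputs have to be moved by the pre-duplication
// shuffle.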
bool TargetLo = LoInputs.size() >= HiInputs.size();
ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
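// PreDupI16Shuffle is the word shuffle applied before the byte duplication;
// LaneMap records, for each original byte index, where that byte lands
// after it.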
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
PreDupI16Shuffle[I/2] = I/2;
LaneMap[I] = I;
}
int j = TargetLo ? 0 : 4, je = j + 4;
for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
// Check if j is already a shuffle of this input. This happens when
// there are two adjacent bytes after we move the low one.
if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
// If we haven't yet mapped the input, search for a slot into which
// we can map it.
while (j < je && PreDupI16Shuffle[j] >= 0)
++j;
if (j == je)
// We can't place the inputs into a single half with a simple i16
// shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
PreDupI16Shuffle[j] = MovingInputs[i] / 2;
}
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
V1 = DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
MVT::v16i8, V1, V1);
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0) {
int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
if (PostDupI16Shuffle[i / 2] < 0)
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
Zeroable, DAG))
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
// lowerings can find an instruction sequence that is faster than a PSHUFB, we
// want to preserve that and we can DAG combine any longer sequences into
// a PSHUFB in the end. But once we start blending from multiple inputs,
// the complexity of DAG combining bad patterns back into PSHUFB is too high,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget.hasSSSE3()) {
bool V1InUse = false;
bool V2InUse = false;
SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
// If both V1 and V2 are in use and we can use a direct blend or an unpack,
// do so. This avoids using them to handle blends-with-zero which is
// important as a single pshufb is significantly faster for that.
if (V1InUse && V2InUse) {
if (Subtarget.hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Blend;
// We can use an unpack to do the blending rather than an or in some
// cases. Even though the or may be (marginally) more efficient, we
// prefer this lowering because there are common cases where part of
// the complexity of the shuffles goes away when we do the final blend as
// an unpack.
// FIXME: It might be worth trying to detect if the unpack-feeding
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
}
return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
if (SDValue V = lowerVectorShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return V;
if (SDValue BitBlend =
lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
return BitBlend;
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
bool IsSingleInput = V2.isUndef();
if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
// We use the mask type to pick which bytes are preserved based on how many
// elements are dropped.
MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
SDValue ByteClearMask = DAG.getBitcast(
MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
// Now pack things back together.
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
// Handle multi-input cases by blending single-input shuffles.
if (NumV2Elements > 0)
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
Mask, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
if (Mask[i] >= 0)
(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
SDValue VLoHalf, VHiHalf;
// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
// Use a mask to drop the high bytes.
VLoHalf = DAG.getBitcast(MVT::v8i16, V);
VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
DAG.getConstant(0x00FF, DL, MVT::v8i16));
// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
VHiHalf = DAG.getUNDEF(MVT::v8i16);
// Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
if (M >= 0)
M /= 2;
for (int &M : HiBlendMask)
if (M >= 0)
M /= 2;
} else {
// Otherwise just unpack the low half of V into VLoHalf and the high half into
// VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Unimplemented!");
}
}
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
// Rather than splitting build-vectors, just build two narrower build
// vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
V = peekThroughBitcasts(V);
MVT OrigVT = V.getSimpleValueType();
int OrigNumElements = OrigVT.getVectorNumElements();
int OrigSplitNumElements = OrigNumElements / 2;
MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
SDValue LoV, HiV;
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV) {
LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(0, DL));
HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
DAG.getIntPtrConstant(OrigSplitNumElements, DL));
} else {
SmallVector<SDValue, 16> LoOps, HiOps;
for (int i = 0; i < OrigSplitNumElements; ++i) {
LoOps.push_back(BV->getOperand(i));
HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
}
LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
SDValue LoV1, HiV1, LoV2, HiV2;
std::tie(LoV1, HiV1) = SplitVector(V1);
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
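// V1BlendMask/V2BlendMask shuffle the two halves of each input
// independently; BlendMask then picks, per element, between those two
// intermediate results.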
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
V2BlendMask[i] = M - NumElements;
BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
if (!UseLoV2 && !UseHiV2)
return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
if (!UseLoV1 && !UseHiV1)
return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
for (int i = 0; i < SplitNumElements; ++i)
if (BlendMask[i] >= SplitNumElements)
BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
// prefer that lowering. This is especially important because broadcasts can
// often fold with memory operands.
auto DoBothBroadcast = [&] {
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
if (M >= Size) {
if (V2BroadcastIdx < 0)
V2BroadcastIdx = M - Size;
else if (M - Size != V2BroadcastIdx)
return false;
} else if (M >= 0) {
if (V1BroadcastIdx < 0)
V1BroadcastIdx = M;
else if (M != V1BroadcastIdx)
return false;
}
return true;
};
if (DoBothBroadcast())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
// unusually few instructions.
int LaneCount = VT.getSizeInBits() / 128;
int LaneSize = Size / LaneCount;
SmallBitVector LaneInputs[2];
LaneInputs[0].resize(LaneCount, false);
LaneInputs[1].resize(LaneCount, false);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
// Otherwise, just fall back to decomposed shuffles and a blend. This requires
// that the decomposed single-input shuffles don't end up here.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross-lane shuffle, which
/// is fewer than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
int LaneSize = Size / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
SmallVector<int, 32> FlippedBlendMask(Size);
for (int i = 0; i < Size; ++i)
FlippedBlendMask[i] =
Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
? Mask[i]
: Mask[i] % LaneSize +
(i / LaneSize) * LaneSize + Size);
// Flip the vector, and blend the results which should now be in-lane. The
// VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
// 5 for the high source. The value 3 selects the high half of source 2 and
// the value 2 selects the low half of source 2. We only use source 2 to
// allow folding it into a memory operand.
unsigned PERMMask = 3 | 2 << 4;
SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
V1, DAG.getConstant(PERMMask, DL, MVT::i8));
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
// TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsV1Zero && !IsV2Zero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
// With AVX2, use VPERMQ/VPERMPD to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
}
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
// convert the 64-bit shuffle mask selection values into 128-bit
// selection bits by dividing the indexes by 2 and shifting into positions
// defined by a vperm2*128 instruction's immediate control byte.
// The immediate permute control byte looks like this:
// [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
// [3] - zero low half of destination
// [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
// [7] - zero high half of destination
int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
unsigned PermMask = MaskLO | (MaskHI << 4);
// If either input is a zero vector, replace it with an undef input.
// Shuffle mask values < 4 are selecting elements of V1.
// Shuffle mask values >= 4 are selecting elements of V2.
// Adjust each half of the permute mask by clearing the half that was
// selecting the zero vector and setting the zero mask bit.
if (IsV1Zero) {
V1 = DAG.getUNDEF(VT);
if (MaskLO < 2)
PermMask = (PermMask & 0xf0) | 0x08;
if (MaskHI < 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
if (IsV2Zero) {
V2 = DAG.getUNDEF(VT);
if (MaskLO >= 2)
PermMask = (PermMask & 0xf0) | 0x08;
if (MaskHI >= 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
assert(!V2.isUndef() && "This is only useful with multiple inputs.");
int Size = Mask.size();
int LaneSize = 128 / VT.getScalarSizeInBits();
int NumLanes = Size / LaneSize;
assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
// check whether the in-128-bit lane shuffles share a repeating pattern.
SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {
if (Mask[i] < 0)
continue;
int j = i / LaneSize;
if (Lanes[j] < 0) {
// First entry we've seen for this lane.
Lanes[j] = Mask[i] / LaneSize;
} else if (Lanes[j] != Mask[i] / LaneSize) {
// This doesn't match the lane selected previously!
return SDValue();
}
// Check that within each lane we have a consistent shuffle mask.
int k = i % LaneSize;
if (InLaneMask[k] < 0) {
InLaneMask[k] = Mask[i] % LaneSize;
} else if (InLaneMask[k] != Mask[i] % LaneSize) {
// This doesn't fit a repeating in-lane mask.
return SDValue();
}
}
// First shuffle the lanes into place.
MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
VT.getSizeInBits() / 64);
SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
for (int i = 0; i < NumLanes; ++i)
if (Lanes[i] >= 0) {
LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
}
V1 = DAG.getBitcast(LaneVT, V1);
V2 = DAG.getBitcast(LaneVT, V2);
SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
// Cast it back to the type we actually want.
LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
// Now do a simple shuffle that isn't lane crossing.
SmallVector<int, 8> NewMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
"Must not introduce lane crosses at this point!");
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(VT.is256BitVector() && "Expected 256-bit vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
if (!UndefLower && !UndefUpper)
return SDValue();
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (UndefUpper &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(HalfNumElts, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}
// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
// If the shuffle only uses two of the four halves of the input operands,
// then extract them and perform the 'half' shuffle at half width.
// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
int HalfIdx1 = -1, HalfIdx2 = -1;
SmallVector<int, 8> HalfMask(HalfNumElts);
unsigned Offset = UndefLower ? HalfNumElts : 0;
for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + Offset];
if (M < 0) {
HalfMask[i] = M;
continue;
}
// Determine which of the 4 half vectors this element is from.
// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
int HalfIdx = M / HalfNumElts;
// Determine the element index into its half vector source.
int HalfElt = M % HalfNumElts;
// We can shuffle with up to 2 half vectors, set the new 'half'
// shuffle mask accordingly.
if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
continue;
}
if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
HalfMask[i] = HalfElt + HalfNumElts;
HalfIdx2 = HalfIdx;
continue;
}
// Too many half vectors referenced.
return SDValue();
}
assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
// Only shuffle the halves of the inputs when useful.
int NumLowerHalves =
(HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
int NumUpperHalves =
(HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
// uuuuXXXX - don't extract uppers just to insert again.
if (UndefLower && NumUpperHalves != 0)
return SDValue();
// XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
if (UndefUpper && NumUpperHalves == 2)
return SDValue();
// AVX2 - XXXXuuuu - always extract lowers.
if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
// AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
// AVX2 supports variable 32-bit element cross-lane shuffles.
if (VT == MVT::v8f32 || VT == MVT::v8i32) {
// XXXXuuuu - don't extract lowers and uppers.
if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
return SDValue();
}
}
auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
SDValue V = (HalfIdx < 2 ? V1 : V2);
HalfIdx = (HalfIdx % 2) * HalfNumElts;
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
DAG.getIntPtrConstant(HalfIdx, DL));
};
SDValue Half1 = GetHalfVector(HalfIdx1);
SDValue Half2 = GetHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
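/// e.g. for a 4-element mask <u, 5, u, 7>, input 1 is already in place since
/// elements 5 and 7 land in slots 1 and 3 of the result.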
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
return false;
return true;
}
/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
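/// e.g. (when permuting whole 128-bit lanes) the v8f32 mask <5,4,5,4,1,0,1,0>
/// becomes the in-lane repeating shuffle <1,0,1,0,5,4,5,4> followed by the
/// lane permute <4,5,6,7,0,1,2,3>.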
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumLaneElts = NumElts / NumLanes;
// On AVX2 we may be able to just shuffle the lowest elements and then
// broadcast the result.
if (Subtarget.hasAVX2()) {
for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
// Attempt to match a repeating pattern every NumBroadcastElts,
// accounting for UNDEFs, that only references the lowest 128-bit
// lane of the inputs.
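// e.g. a v8i32 mask <1,0,1,0,1,0,1,0> matches a 64-bit repeat: shuffle V1
// with <1,0,u,u,u,u,u,u> and then broadcast the low two elements with the
// mask <0,1,0,1,0,1,0,1>.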
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
return false;
if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
// Shuffle the (lowest) repeated elements in place for broadcast.
SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
// Shuffle the actual broadcast.
SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumBroadcastElts)
for (int j = 0; j != NumBroadcastElts; ++j)
BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
// Check that all the sources are coming from the same lane and see if we can
// form a repeating shuffle mask (local to each sub-lane). At the same time,
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
// and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
}
// Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
// Attempt to match against the candidate repeated sub-lane masks.
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
continue;
if (M1[i] != M2[i])
return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
// Merge the sub-lane mask into the matching repeated sub-lane mask.
for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
continue;
assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
"Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
// Track the topmost source sub-lane - by setting the remaining to UNDEF
// we can greatly simplify shuffle matching.
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
// Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
"Unexpected source lane");
// Create a repeating shuffle mask for the entire vector.
SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
int Lane = SubLane / SubLaneScale;
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
// Shuffle each source sub-lane to its destination.
SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
for (int i = 0; i != NumElts; i += NumSubLaneElts) {
int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
continue;
for (int j = 0; j != NumSubLaneElts; ++j)
SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm,
ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
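// e.g. for v4f64 the mask <1,5,2,6> is a valid SHUFPD mask and encodes as
// ShuffleImm 0b0011, since bit i is simply Mask[i] % 2.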
ShuffleImm = 0;
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef)
continue;
if (Mask[i] < 0)
return false;
int Val = (i & 6) + NumElts * (i & 1);
int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
if (Mask[i] < Val || Mask[i] > Val + 1)
ShufpdMask = false;
if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
CommutableMask = false;
ShuffleImm |= (Mask[i] % 2) << i;
}
if (ShufpdMask)
return true;
if (CommutableMask) {
std::swap(V1, V2);
return true;
}
return false;
}
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
DAG.getConstant(Immediate, DL, MVT::i8));
}
static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
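// e.g. the mask <1,0,3,2> yields VPERMILPMask 0b0101 (bits 0 and 2 set).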
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG);
}
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op =
lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input across lanes with a single
// instruction, so skip this pattern.
if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
isShuffleMaskInputInPlace(1, Mask))))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
Mask, DAG);
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the vector, we
// can use lower latency instructions that will operate on both lanes.
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
}
// Try to use PALIGNR.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either input is already in place,
// we will be able to shuffle the other input across lanes with a single
// instruction, so skip this pattern.
if (!isShuffleMaskInputInPlace(0, Mask) &&
!isShuffleMaskInputInPlace(1, Mask))
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return V;
// If we have a single-input shuffle with different shuffle patterns in the
// two 128-bit lanes, use a variable mask with VPERMILPS.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
if (Subtarget.hasAVX2())
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
DAG);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Result;
// If we have VLX support, we can use VEXPAND.
if (Subtarget.hasVLX())
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
// For non-AVX512, if the mask is an in-lane unpack of 16-bit elements then try
// to split, since after the split we get more efficient code using vpunpcklwd
// and vpunpckhwd instructions than with vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG))
return V;
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG);
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// For non-AVX512, if the mask is an in-lane unpack of 16-bit elements then try
// to split, since after the split we get more efficient code than vblend by
// using vpunpcklwd and vpunpckhwd instructions.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
if (SDValue V =
lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
}
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v8i32, ShufPS);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
Mask, DAG);
}
/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return V;
if (V2.isUndef()) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
// AVX512BWVL can lower to VPERMW.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
Mask, Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
DAG);
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Result;
// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = VT.getVectorNumElements();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
// essentially *zero* ability to manipulate a 256-bit vector with integer
// types. Since we'll use floating point types there eventually, just
// immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget.hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32) {
// No floating point type available, if we can't use the bit operations
// for masking/blending then decompose into 128-bit vectors.
if (SDValue V =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
return V;
if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
V1 = DAG.getBitcast(FpVT, V1);
V2 = DAG.getBitcast(FpVT, V2);
return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
}
/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
assert(VT.getScalarSizeInBits() == 64 &&
"Unexpected element type size for 128bit shuffle.");
// Handling a 256-bit vector here would require VLX, and
// lowerV2X128VectorShuffle() is most probably the better solution for it.
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 0, 1, 2, 3});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
{0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0, DL));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
OnlyUsesV1 ? V1 : V2,
DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
assert(WidenedMask.size() == 4);
// See if this is an insertion of the lower 128-bits of V2 into V1.
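// e.g. the widened mask <0,4,2,3> keeps V1 in place except for 128-bit
// chunk 1, which takes the lowest 128 bits of V2.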
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
if (WidenedMask[i] < 4) {
if (WidenedMask[i] != i) {
IsInsert = false;
break;
}
} else {
// Make sure we only have a single V2 index and that it's the lowest 128 bits.
if (V2Index >= 0 || WidenedMask[i] != 4) {
IsInsert = false;
break;
}
V2Index = i;
}
}
if (IsInsert && V2Index >= 0) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
DAG.getIntPtrConstant(0, DL));
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure the elements come from the same Op.
for (int i = 0; i < 4; ++i) {
assert(WidenedMask[i] >= -1);
if (WidenedMask[i] < 0)
continue;
SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
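// e.g. the widened mask <2,3,6,7> (upper halves of V1 and V2) encodes as
// PermMask 0xEE.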
PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
}
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Shuf128;
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op =
lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
V2, DAG, Subtarget))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
if (SDValue Shuf128 =
lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Shuf128;
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the vector, we
// can use lower latency instructions that will operate on all four
// 128-bit lanes.
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
DAG.getBitcast(MVT::v16i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
SmallVector<int, 4> Repeated256Mask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
V2, DAG, Subtarget))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
bool Is128BitLaneRepeatedShuffle =
is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use VALIGN.
if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
Mask, Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
if (Subtarget.hasBWI())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
if (V2.isUndef()) {
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
return lowerV8I16GeneralSingleInputVectorShuffle(
DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
return PSHUFB;
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
int NumElts = Mask.size();
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
if (NumV2Elements == 1 && Mask[0] >= NumElts)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
}
// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
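// e.g. a v8i1 shuffle is sign-extended to v8i64, shuffled as a 512-bit
// vector, and the result is truncated back to v8i1.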
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
switch (VT.SimpleTy) {
default:
llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
ExtVT = MVT::v2i64;
break;
case MVT::v4i1:
ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
break;
case MVT::v16i1:
ExtVT = MVT::v16i32;
break;
case MVT::v32i1:
ExtVT = MVT::v32i16;
break;
case MVT::v64i1:
ExtVT = MVT::v64i8;
break;
}
if (ISD::isBuildVectorAllZeros(V1.getNode()))
V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V1.getNode()))
V1 = getOnesVector(ExtVT, DAG, DL);
else
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
if (V2.isUndef())
V2 = DAG.getUNDEF(ExtVT);
else if (ISD::isBuildVectorAllZeros(V2.getNode()))
V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V2.getNode()))
V2 = getOnesVector(ExtVT, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
// Since i1 was sign-extended we can use X86ISD::CVT2MASK.
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
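/// e.g. a 4-element mask <4,5,6,3> takes three elements from V2 but only one
/// from V1, so the shuffle should be commuted.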
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
int NumElements = Mask.size();
int NumV1Elements = 0, NumV2Elements = 0;
for (int M : Mask)
if (M < 0)
continue;
else if (M < NumElements)
++NumV1Elements;
else
++NumV2Elements;
// Commute the shuffle as needed such that more elements come from V1 than
// V2. This allows us to match the shuffle pattern strictly on how many
// elements come from V1 without handling the symmetric cases.
if (NumV2Elements > NumV1Elements)
return true;
assert(NumV1Elements > 0 && "No V1 indices");
if (NumV2Elements == 0)
return false;
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
// of indices for V2. When those are equal, try to ensure that the number of odd
// indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : Mask.slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
if (LowV2Elements > LowV1Elements)
return true;
if (LowV2Elements == LowV1Elements) {
int SumV1Indices = 0, SumV2Indices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
SumV2Indices += i;
else if (Mask[i] >= 0)
SumV1Indices += i;
if (SumV2Indices < SumV1Indices)
return true;
if (SumV2Indices == SumV1Indices) {
int NumV1OddIndices = 0, NumV2OddIndices = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
if (Mask[i] >= NumElements)
NumV2OddIndices += i % 2;
else if (Mask[i] >= 0)
NumV1OddIndices += i % 2;
if (NumV2OddIndices < NumV1OddIndices)
return true;
}
}
}
return false;
}
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
int NumElements = VT.getVectorNumElements();
SDLoc DL(Op);
bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
"Can't lower MMX shuffles");
bool V1IsUndef = V1.isUndef();
bool V2IsUndef = V2.isUndef();
if (V1IsUndef && V2IsUndef)
return DAG.getUNDEF(VT);
// When we create a shuffle node we put the UNDEF node as the second operand,
// but in some cases the first operand may be transformed to UNDEF.
// In this case we should just commute the node.
if (V1IsUndef)
return DAG.getCommutedVectorShuffle(*SVOp);
// Check for non-undef masks pointing at an undef vector and make the masks
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef)
for (int M : Mask)
if (M >= NumElements) {
SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
}
// Check for illegal shuffle mask element index values.
int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
assert(llvm::all_of(Mask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
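// e.g. a v4i32 shuffle with mask <0,1,6,7> widens to a v2i64 shuffle with
// mask <0,3>.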
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
canWidenShuffleElements(Mask, WidenedMask)) {
MVT NewEltVT = VT.isFloatingPoint()
? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
V1 = DAG.getBitcast(NewVT, V1);
V2 = DAG.getBitcast(NewVT, V2);
return DAG.getBitcast(
VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
}
}
// Commute the shuffle if it will improve canonicalization.
if (canonicalizeShuffleMaskWithCommute(Mask))
return DAG.getCommutedVectorShuffle(*SVOp);
// For each vector width, delegate to a specialized lowering routine.
if (VT.is128BitVector())
return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
DAG);
if (VT.is256BitVector())
return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
DAG);
if (VT.is512BitVector())
return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
DAG);
if (Is1BitVector)
return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
llvm_unreachable("Unimplemented!");
}
/// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
auto *CondBV = cast<BuildVectorSDNode>(Cond);
// Only non-legal VSELECTs reach this lowering; convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
SmallVector<int, 32> Mask;
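// Condition elements that are a non-zero constant select from LHS (mask index
// i), constant-zero elements select from RHS (index i + Size), and anything
// else becomes an undef lane.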
for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
SDValue CondElt = CondBV->getOperand(i);
Mask.push_back(
isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
: -1);
}
return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
// If this VSELECT has a vector of i1 as a mask, it will be directly matched
// with patterns on the mask registers on AVX-512.
if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
return Op;
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
if (VT.getSizeInBits() == 512) {
SDValue Cond = Op.getOperand(0);
// The vNi1 condition case should be handled above as it can be trivially
// lowered.
assert(Cond.getValueType().getScalarSizeInBits() ==
VT.getScalarSizeInBits() &&
"Should have a size-matched integer condition!");
// Build a mask by testing the condition against itself (tests for zero).
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
}
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
case MVT::v32i8:
// The byte blends for AVX vectors were introduced only in AVX2.
if (Subtarget.hasAVX2())
return Op;
return SDValue();
case MVT::v8i16:
case MVT::v16i16:
// AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget.hasBWI() && Subtarget.hasVLX())
return Op;
// FIXME: We should custom lower this by fixing the condition and using i8
// blends.
return SDValue();
}
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to an FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
// the case of a store, it's not worth it if the index is a constant 0,
// because a MOVSSmr can be used instead, which is smaller and faster.
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
if ((User->getOpcode() != ISD::STORE ||
isNullConstant(Op.getOperand(1))) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
Op.getOperand(1));
return DAG.getBitcast(MVT::f32, Extract);
}
if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq work with a constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
}
return SDValue();
}
/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers;
// extend the vector to VR512/VR128.
if (!isa<ConstantSDNode>(Idx)) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit gets better performance on KNL
// than extending to 128/256-bit.
unsigned VecSize = (NumElts <= 4 ? 128 : 512);
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
(VecVT.getVectorNumElements() < 8)) {
// Use kshiftlw/rw instruction.
VecVT = MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
DAG.getUNDEF(VecVT),
Vec,
DAG.getIntPtrConstant(0, dl));
}
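// Shift the requested bit up to the MSB and then back down to bit 0, clearing
// every other bit, so it can be extracted from element 0 of the mask.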
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
if (MaxSift - IdxVal)
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
// It's more profitable to go through memory (1 cycle throughput)
// than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
// The IACA tool was used to get the performance estimate
// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
//
// example : extractelement <16 x i8> %a, i32 %i
//
// Block Throughput: 3.00 Cycles
// Throughput Bottleneck: Port5
//
// | Num Of | Ports pressure in cycles | |
// | Uops | 0 - DV | 5 | 6 | 7 | |
// ---------------------------------------------
// | 1 | | 1.0 | | | CP | vmovd xmm1, edi
// | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
// | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
// Total Num Of Uops: 4
//
//
// Block Throughput: 1.00 Cycles
// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
//
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
// |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
// |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
// |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
// Total Num Of Uops: 4
return SDValue();
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
// Get the 128-bit vector.
Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
// this can be done with a mask.
IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getConstant(IdxVal, dl, MVT::i32));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
MVT VT = Op.getSimpleValueType();
if (VT.getSizeInBits() == 16) {
// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
// we're going to zero extend the register or fold the store (SSE41 only).
if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
// Transform it so it matches pextrw, which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
if (Subtarget.hasSSE41())
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
// TODO: We only extract a single element from v16i8, we can probably afford
// to be more aggressive here before using the default approach of spilling to
// stack.
if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
// Extract either the lowest i32 or any i16, and extract the sub-byte.
int DWordIdx = IdxVal / 4;
if (DWordIdx == 0) {
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec),
DAG.getIntPtrConstant(DWordIdx, dl));
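// The requested byte sits inside the extracted dword; shift it down to bit 0
// before truncating.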
int ShiftVal = (IdxVal % 4) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
DAG.getConstant(ShiftVal, dl, MVT::i32));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
int WordIdx = IdxVal / 2;
SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
DAG.getBitcast(MVT::v8i16, Vec),
DAG.getIntPtrConstant(WordIdx, dl));
int ShiftVal = (IdxVal % 2) * 8;
if (ShiftVal != 0)
Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
DAG.getConstant(ShiftVal, dl, MVT::i16));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
return Op;
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
if (IdxVal == 0)
return Op;
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0, dl));
}
return SDValue();
}
/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
MVT VecVT = Vec.getSimpleValueType();
if (!isa<ConstantSDNode>(Idx)) {
// Non-constant index. Extend the source and destination,
// insert the element, and then truncate the result.
MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
unsigned NumElems = VecVT.getVectorNumElements();
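// The bit to insert is already in element 0 of EltInVec; if the destination
// vector is undef it only needs to be shifted up to position IdxVal.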
if (Vec.isUndef()) {
if (IdxVal)
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return EltInVec;
}
// Insertion of one bit into the first position.
if (IdxVal == 0) {
// Clean top bits of vector.
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(NumElems - 1, dl, MVT::i8));
EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
DAG.getConstant(NumElems - 1, dl, MVT::i8));
// Clean the first bit in the source vector.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
// Insertion of one bit into the last position.
if (IdxVal == NumElems - 1) {
// Move the bit to the last position inside the vector.
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
// Use shuffle to insert element.
SmallVector<int, 64> MaskVec(NumElems);
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = (i == IdxVal) ? NumElems : i;
return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
return InsertBitToMaskVector(Op, DAG);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
if (!isa<ConstantSDNode>(N2))
return SDValue();
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
bool IsZeroElt = X86::isZeroNode(N1);
bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
// If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle with a rematerializable vector than a costly integer
// insertion.
if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
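// Take every lane from N0 except the insertion index, which selects the
// rematerializable constant vector (indices >= NumElts pick the second
// shuffle operand).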
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
: DAG.getConstant(-1, dl, VT);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
// With a 256-bit vector, we can insert into the zero element efficiently
// using a blend if we have AVX or AVX2 and the right data type.
if (VT.is256BitVector() && IdxVal == 0) {
// TODO: It is worthwhile to cast integer to floating point and back
// and incur a domain crossing penalty if that's what we'll end up
// doing anyway after extracting to a 128-bit vector.
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
N2 = DAG.getIntPtrConstant(1, dl);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
}
}
// Get the desired 128-bit vector chunk.
SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
assert(isPowerOf2_32(NumEltsIn128));
// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getConstant(IdxIn128, dl, MVT::i32));
// Insert the changed part back into the bigger vector
return insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
// argument. SSE41 required for pinsrb.
if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
unsigned Opc;
if (VT == MVT::v8i16) {
assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
Opc = X86ISD::PINSRW;
} else {
assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
Opc = X86ISD::PINSRB;
}
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N2 = DAG.getIntPtrConstant(1, dl);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
}
N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
// PINSR* works with constant index.
if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
}
return SDValue();
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
// Replacing a xor+movd with xorps is always cheaper and simplifies further
// combines.
if (X86::isZeroNode(Op.getOperand(0)))
return getZeroVector(OpVT, Subtarget, DAG, dl);
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
assert(OpVT.is128BitVector() && "Expected an SSE type!");
// Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
if (OpVT == MVT::v4i32)
return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT ResVT = Op.getSimpleValueType();
+ // When v1i1 is legal, a scalarization of a vselect with a vXi1 Cond
+ // would result in: v1i1 = extract_subvector(vXi1, idx).
+ // Lower these into extract_vector_elt which is already selectable.
+ if (ResVT == MVT::v1i1) {
+ assert(Subtarget.hasAVX512() &&
+ "Boolean EXTRACT_SUBVECTOR requires AVX512");
+
+ MVT EltVT = ResVT.getVectorElementType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT LegalVT =
+ (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
+ }
+
assert((In.getSimpleValueType().is256BitVector() ||
In.getSimpleValueType().is512BitVector()) &&
"Can only extract from 256-bit or 512-bit vectors");
// If the input is a buildvector just emit a smaller one.
unsigned ElemsPerChunk = ResVT.getVectorNumElements();
if (In.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getBuildVector(
ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
// Everything else is legal.
return Op;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
return insert1BitVector(Op, DAG, Subtarget);
}
// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget.isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
return X86ISD::WrapperRIP;
return X86ISD::Wrapper;
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
return Result;
}
SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
SDLoc DL(JT);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (OpFlag)
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
return Result;
}
SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
SDLoc DL(Op);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isPositionIndependent() && !Subtarget.is64Bit()) {
Result =
DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
}
// For symbols that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlag))
Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
return Result;
}
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
// Create the TargetBlockAddress node.
unsigned char OpFlags =
Subtarget.classifyBlockAddressReference();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
return Result;
}
SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
const SDLoc &dl, int64_t Offset,
SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
CodeModel::Model M = DAG.getTarget().getCodeModel();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (OpFlags == X86II::MO_NO_FLAG &&
X86::isOffsetSuitableForCodeModel(Offset, M)) {
// A direct static reference to a global.
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
Offset = 0;
} else {
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
}
Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
if (isGlobalRelativeToPICBase(OpFlags)) {
Result = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
}
// For globals that require a load from a stub to get the address, emit the
// load.
if (isGlobalStubReference(OpFlags))
Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// If there was a non-zero offset that we didn't fold, create an explicit
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
DAG.getConstant(Offset, dl, PtrVT));
return Result;
}
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
: X86ISD::TLSADDR;
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
} else {
SDValue Ops[] = { Chain, TGA };
Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
}
// TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
MFI.setAdjustsStack(true);
MFI.setHasCalls(true);
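// The runtime call leaves the TLS address in ReturnReg (EAX/RAX); copy it out
// glued to the call node.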
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
SDValue InFlag;
SDLoc dl(GA); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg,
SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG,
const EVT PtrVT,
bool is64Bit) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
.getInfo<X86MachineFunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
if (is64Bit) {
Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
X86II::MO_TLSLDM, /*LocalDynamic=*/true);
}
// Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of Base.
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
bool is64Bit, bool isPIC) {
SDLoc dl(GA);
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
MachinePointerInfo(Ptr));
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
// initialexec.
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
} else if (model == TLSModel::InitialExec) {
if (is64Bit) {
OperandFlags = X86II::MO_GOTTPOFF;
WrapperKind = X86ISD::WrapperRIP;
} else {
OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
}
} else {
llvm_unreachable("Unexpected model");
}
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
SDValue TGA =
DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
MachinePointerInfo::getGOT(DAG.getMachineFunction()));
}
// The address of the thread local variable is the thread pointer plus the
// offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().Options.EmulatedTLS)
return LowerToTLSEmulatedModel(GA, DAG);
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
bool PositionIndependent = isPositionIndependent();
if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget.is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
Subtarget.is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
if (Subtarget.isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
X86ISD::WrapperRIP : X86ISD::Wrapper;
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
if (PIC32)
OpFlag = X86II::MO_TLVP_PIC_BASE;
else
OpFlag = X86II::MO_TLVP;
SDLoc DL(Op);
SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
GA->getValueType(0),
GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
if (PIC32)
Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
// Lowering the machine isd will make sure everything is in the right
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
DAG.getIntPtrConstant(0, DL, true),
Chain.getValue(1), DL);
// TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
}
if (Subtarget.isTargetKnownWindowsMSVC() ||
Subtarget.isTargetWindowsItanium() ||
Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
// Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
// mov rcx, qword [rdx+rcx*8]
// mov eax, .tls$:tlsvar
// [rax+rcx] contains the address
// Windows 64bit: gs:0x58
// Windows 32bit: fs:__tls_array
SDLoc dl(GA);
SDValue Chain = DAG.getEntryNode();
// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
// use its literal value of 0x2C.
Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
? Type::getInt8PtrTy(*DAG.getContext(),
256)
: Type::getInt32PtrTy(*DAG.getContext(),
257));
SDValue TlsArray = Subtarget.is64Bit()
? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget.isTargetWindowsGNU()
? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", PtrVT));
SDValue ThreadPointer =
DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
SDValue res;
if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
res = ThreadPointer;
} else {
// Load the _tls_index variable
SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
if (Subtarget.is64Bit())
IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
MachinePointerInfo(), MVT::i32);
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
auto &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
}
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of the start of the .tls section.
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the thread pointer plus the
// offset of the variable.
return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
}
llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
MVT VT = Op.getSimpleValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
// X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
// generic ISD nodes don't. Insert an AND to be safe; it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits - 1, dl, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i8))
: DAG.getConstant(0, dl, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
// If the shift amount is larger than or equal to the width of a part we can't
// rely on the results of shld/shrd. Insert a test and select the appropriate
// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));
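// VTBits is a power of two, so (ShAmt & VTBits) is non-zero exactly when the
// shift amount is at least VTBits (for amounts below 2*VTBits); the CMOVs
// below then pick the values computed for such large shifts.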
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
AndNode, DAG.getConstant(0, dl, MVT::i8));
SDValue Hi, Lo;
SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
if (Op.getOpcode() == ISD::SHL_PARTS) {
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
} else {
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
}
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
if (SrcVT.getVectorElementType() == MVT::i1) {
if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
}
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
// These are really Legal; return the operand so the caller accepts it as
// Legal.
if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
return Op;
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
Subtarget.is64Bit()) {
return Op;
}
SDValue ValueToStore = Op.getOperand(0);
if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
!Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
SDValue Chain = DAG.getStore(
DAG.getEntryNode(), dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
SDLoc DL(Op);
SDVTList Tys;
bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
if (useSSE)
Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
else
Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
unsigned ByteSize = SrcVT.getSizeInBits()/8;
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
MachineMemOperand *MMO;
if (FI) {
int SSFI = FI->getIndex();
MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, ByteSize, ByteSize);
} else {
MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
StackSlot = StackSlot.getOperand(1);
}
SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
X86ISD::FILD, DL,
Tys, Ops, SrcVT, MMO);
if (useSSE) {
Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is flagged to the FILD_FLAG. This
// shouldn't be necessary except that RFP cannot be live across
// multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
unsigned SSFISize = Op.getValueSizeInBits()/8;
int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {
Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
};
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOStore, SSFISize, SSFISize);
Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
Ops, Op.getValueType(), MMO);
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
}
return Result;
}
/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SelectionDAG &DAG) const {
// This algorithm is not obvious. Here is what we're trying to output:
/*
movq %rax, %xmm0
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
haddpd %xmm0, %xmm0
#else
pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
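// 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84.
// Interleaving them with the two 32-bit halves of the input forms the doubles
// 2^52 + lo and 2^84 + hi * 2^32; subtracting c1 leaves lo and hi * 2^32,
// and the final horizontal add yields their sum, i.e. the original value.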
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
SmallVector<Constant*,2> CV1;
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
Op.getOperand(0));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 =
DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
if (Subtarget.hasSSE3()) {
// FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
}
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
}
/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
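// The bias is the double 2^52; OR-ing the 32-bit value into its low mantissa
// bits produces exactly 2^52 + x, so subtracting the bias below recovers x.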
// Load the 32-bit value into an XMM register.
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
Op.getOperand(0));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Load),
DAG.getIntPtrConstant(0, dl));
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
// Handle final rounding.
MVT DestVT = Op.getSimpleValueType();
if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
DAG.getIntPtrConstant(0, dl));
if (DestVT.bitsGT(MVT::f64))
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
// No final rounding is needed; the destination type is already f64.
return Sub;
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget, SDLoc &DL) {
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
SDValue N0 = Op.getOperand(0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
// Legalize to v4i32 type.
N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
if (Subtarget.hasAVX512())
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
// Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
// but using v2i32 to v2f64 with X86ISD::CVTSI2P.
SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
// Two to the power of half-word-size.
SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
// Clear upper part of LO, lower HI.
SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
// Add the two halves.
return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
// #else
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
// #endif
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
// We shouldn't use it when unsafe-fp-math is enabled though: we might later
// reassociate the two FADDs, and if we do that, the algorithm fails
// spectacularly (PR24512).
// FIXME: If we ever have some kind of Machine FMF, this should be marked
// as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
// there are also the MachineCombiner reassociations happening on Machine IR.
if (DAG.getTarget().Options.UnsafeFPMath)
return SDValue();
SDLoc DL(Op);
SDValue V = Op->getOperand(0);
MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something other than the supported type, e.g., to v4f64,
// abort early.
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
"Unsupported custom type");
// In the #ifdef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
// -- 0x53000000
// - A shift:
// -- v >> 16
// Create the splat vector for 0x4b000000.
SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
// Create the splat vector for 0x53000000.
SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
// Create the right shift.
SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
SDValue Low, High;
if (Subtarget.hasSSE41()) {
MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
// uint4 hi = (v >> 16) | (uint4) 0x53000000;
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
SDValue VecCstFAdd = DAG.getConstantFP(
APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
SDValue FHigh =
DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
// return (float4) lo + fhi;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
if (SrcVT.getVectorElementType() == MVT::i1) {
if (SrcVT == MVT::v2i1)
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
}
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
case MVT::v4i8:
case MVT::v4i16:
case MVT::v8i8:
case MVT::v8i16: {
MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
case MVT::v16i8:
case MVT::v16i16:
assert(Subtarget.hasAVX512());
return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
return Op;
}
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
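// On little-endian x86 the i32 value occupies the low 4 bytes and the high
// 4 bytes are zeroed, so the slot holds the value zero-extended to 64 bits
// for the FILD.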
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
return Fild;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue ValueToStore = Op.getOperand(0);
if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
MachineMemOperand::MOLoad, 8, 8);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
APInt FF(32, 0x5F800000ULL);
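// 0x5F800000 is the IEEE-754 single-precision encoding of 2^64 (exponent
// field 0xBF = 191, bias 127, mantissa 0). FILD read the buffer as a signed
// i64, so an input with its sign bit set was interpreted as (Value - 2^64);
// conditionally adding 2^64 back below recovers the unsigned value.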
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
// FIXME: Avoid the extend by constructing the right constant pool?
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool IsSigned, bool IsReplace) const {
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
// f16 must be promoted before using the lowering in this routine.
// fp128 does not use this lowering.
return std::make_pair(SDValue(), SDValue());
}
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned &&
DstTy == MVT::i64 &&
(!Subtarget.is64Bit() ||
!isScalarFPTypeInSSEReg(TheVT));
if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
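// Any uint32 value fits in the non-negative range of i64, so the signed
// 64-bit conversion is exact and its low 32 bits are the uint32 result.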
assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
DstTy = MVT::i64;
}
assert(DstTy.getSimpleVT() <= MVT::i64 &&
DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// These are really Legal.
if (DstTy == MVT::i32 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
if (Subtarget.is64Bit() &&
DstTy == MVT::i64 &&
isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
return std::make_pair(SDValue(), SDValue());
// We lower FP->int64 into FISTP64 followed by a load from a temporary
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getSizeInBits()/8;
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
unsigned Opc;
switch (DstTy.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
}
SDValue Chain = DAG.getEntryNode();
SDValue Value = Op.getOperand(0);
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
//
// Conversion to unsigned i64 is implemented with a select,
// depending on whether the source value fits in the range
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
// Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
// FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
//
// Being a power of 2, Thresh is exactly representable in all FP formats.
// For X87 we'd like to use the smallest FP type for this constant, but
// for DAG type consistency we have to match the FP operand type.
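// For example, with Value = 2^63 (as f64): Value < Thresh is false, so
// Adjust = 0x80000000 and FistSrc = Value - 2^63 = 0.0; the FIST stores 0,
// and XOR'ing the high half with Adjust yields 0x8000000000000000 = 2^63.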
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
&LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
assert(Status == APFloat::opOK && !LosesInfo &&
"FP conversion should have been exact");
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
SDValue Cmp = DAG.getSetCC(DL,
getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
DAG.getConstant(0, DL, MVT::i32),
DAG.getConstant(0x80000000, DL, MVT::i32));
SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TheVT),
Value, ThreshVal, ISD::SETLT);
Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
}
// FIXME This causes a redundant load/store if the SSE-class value is already
// in memory, such as if it is on the callstack.
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot,
MachinePointerInfo::getFixedStack(MF, SSFI));
SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
SDValue Ops[] = {
Chain, StackSlot, DAG.getValueType(TheVT)
};
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOLoad, MemSize, MemSize);
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
Chain = Value.getValue(1);
SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
}
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, MemSize, MemSize);
if (UnsignedFixup) {
// Insert the FIST, load its result as two i32's,
// and XOR the high i32 with Adjust.
SDValue FistOps[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
FistOps, DstTy, MMO);
SDValue Low32 =
DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
SDValue High32 =
DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
if (Subtarget.is64Bit()) {
// Join High32 and Low32 into a 64-bit result.
// (High32 << 32) | Low32
Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
DAG.getConstant(32, DL, MVT::i8));
SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
return std::make_pair(Result, SDValue());
}
SDValue ResultOps[] = { Low32, High32 };
SDValue pair = IsReplace
? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
: DAG.getMergeValues(ResultOps, DL);
return std::make_pair(pair, SDValue());
} else {
// Build the FP_TO_INT*_IN_MEM
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
Ops, DstTy, MMO);
return std::make_pair(FIST, StackSlot);
}
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
// Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
// Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
// Concat upper and lower parts.
//
// v4i32 -> v4i64
// Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
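// For example, zero-extending v8i16 {a,b,c,d,e,f,g,h} without AVX2:
// vpunpcklwd with a zero vector gives {a,0,b,0,c,0,d,0}, which bitcast to
// v4i32 is exactly {zext(a),zext(b),zext(c),zext(d)} in the little-endian
// lane layout; the upper half is handled the same way with vpunpckhwd.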
if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
return SDValue();
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
SDValue Undef = DAG.getUNDEF(InVT);
bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements()/2);
OpLo = DAG.getBitcast(HVT, OpLo);
OpHi = DAG.getBitcast(HVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
if (InVT.getVectorElementType() != MVT::i1)
return SDValue();
// Extend VT to 512 bits if the result is a 256-bit or 128-bit vector and
// VLX is not supported.
MVT ExtVT = VT;
if (!VT.is512BitVector() && !Subtarget.hasVLX())
ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
SDValue One =
DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
return SDValue();
}
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
VT.getVectorNumElements() != SVT.getVectorNumElements());
return SDValue();
}
/// Helper to recursively truncate vector elements in half with PACKSS.
/// It makes use of the fact that vector comparison results will be all-zeros
/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
/// within each 128-bit lane.
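/// PACKSS saturates each signed element, and saturation maps 0 -> 0 and
/// all-ones (-1) -> all-ones, so zero/all-bits inputs survive the narrowing
/// exactly; that is what makes this shortcut valid.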
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2 but AVX512 has fast truncate.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
EVT SrcVT = In.getValueType();
// No truncation required, we might get here due to recursive calls.
if (SrcVT == DstVT)
return In;
// We only support vector truncation to 128 bits or greater from a source
// that is 256 bits or greater.
if ((DstVT.getSizeInBits() % 128) != 0)
return SDValue();
if ((SrcVT.getSizeInBits() % 256) != 0)
return SDValue();
unsigned NumElems = SrcVT.getVectorNumElements();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
EVT PackedSVT =
EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
// Extract lower/upper subvectors.
unsigned NumSubElts = NumElems / 2;
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
// 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector()) {
Lo = DAG.getBitcast(MVT::v8i16, Lo);
Hi = DAG.getBitcast(MVT::v8i16, Hi);
SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
}
// AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
// AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
Lo = DAG.getBitcast(MVT::v16i16, Lo);
Hi = DAG.getBitcast(MVT::v16i16, Hi);
SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
// 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
Res = DAG.getBitcast(MVT::v4i64, Res);
Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
if (DstVT.is256BitVector())
return DAG.getBitcast(DstVT, Res);
// If this is a 512-bit -> 128-bit truncate, perform another stage.
EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
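// Shifting the LSB into the MSB works because VPMOVB2M/VPMOVW2M collect the
// top bit of each element, and TESTM sets a mask bit when the AND of the
// (identical) operands is nonzero -- which, after the shift, happens exactly
// when the original low bit was 1.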
if (InVT.getScalarSizeInBits() <= 16) {
if (Subtarget.hasBWI()) {
// Legal; this will go to VPMOVB2M, VPMOVW2M.
// Shifting packed bytes is not supported natively, so bitcast to words.
MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
ShiftNode = DAG.getBitcast(InVT, ShiftNode);
return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
}
// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
"Unexpected vector type.");
unsigned NumElts = InVT.getVectorNumElements();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
ShiftInx = InVT.getScalarSizeInBits() - 1;
}
SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
DAG.getConstant(ShiftInx, DL, InVT));
return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
if (VT == MVT::i1) {
assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
"Invalid scalar TRUNCATE operation");
if (InVT.getSizeInBits() >= 32)
return SDValue();
In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
}
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
// Truncate with PACKSS if we are truncating a vector zero/all-bits result.
if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(2, DL));
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
// The PSHUFB mask:
static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1,
16, 17, 20, 21, 24, 25, 28, 29,
-1, -1, -1, -1, -1, -1, -1, -1 };
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask2[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(4, DL));
OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
return DAG.getBitcast(MVT::v8i16, res);
}
// Handle truncation of V256 to V128 using shuffles.
if (!VT.is128BitVector() || !InVT.is256BitVector())
return SDValue();
assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
In = DAG.getBitcast(NVT, In);
SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
return SDValue();
}
assert(!VT.isVector());
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
IsSigned, /*IsReplace=*/ false);
SDValue FIST = Vals.first, StackSlot = Vals.second;
// If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
if (!FIST.getNode())
return Op;
if (StackSlot.getNode())
// Load the result.
return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
// The node is the result.
return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
return DAG.getNode(X86ISD::VFPEXT, DL, VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
// If this is a FABS and it has an FNEG user, bail out to fold the combination
// into an FNABS. We'll lower the FABS after that if it is still in use.
if (IsFABS)
for (SDNode *User : Op->uses())
if (User->getOpcode() == ISD::FNEG)
return Op;
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
bool IsF128 = (VT == MVT::f128);
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
MVT LogicVT;
MVT EltVT;
if (VT.isVector()) {
LogicVT = VT;
EltVT = VT.getVectorElementType();
} else if (IsF128) {
// SSE instructions are used for optimized f128 logical operations.
LogicVT = MVT::f128;
EltVT = VT;
} else {
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
// Using a 16-byte mask allows folding the load of the mask with
// the logic op, so it can save (~4 bytes) on code size.
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
EltVT = VT;
}
unsigned EltBits = EltVT.getSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt =
IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
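// For f32, for example, this is FABS = x & 0x7FFFFFFF, FNEG = x ^ 0x80000000,
// and (below) FNABS = x | 0x80000000.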
const fltSemantics &Sem =
EltVT == MVT::f64 ? APFloat::IEEEdouble() :
(IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
unsigned LogicOp =
IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
// and extract the scalar result back out.
Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Mag = Op.getOperand(0);
SDValue Sign = Op.getOperand(1);
SDLoc dl(Op);
// If the sign operand is smaller, extend it first.
MVT VT = Op.getSimpleValueType();
if (Sign.getSimpleValueType().bitsLT(VT))
Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
// And if it is bigger, shrink it first.
if (Sign.getSimpleValueType().bitsGT(VT))
Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
bool IsF128 = (VT == MVT::f128);
assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
"Unexpected type in LowerFCOPYSIGN");
MVT EltVT = VT.getScalarType();
const fltSemantics &Sem =
EltVT == MVT::f64 ? APFloat::IEEEdouble()
: (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
// Perform all scalar logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE.
// TODO: This isn't necessary. If we used scalar types, we might avoid some
// unnecessary splats, but we might miss load folding opportunities. Should
// this decision be based on OptimizeForSize?
bool IsFakeVector = !VT.isVector() && !IsF128;
MVT LogicVT = VT;
if (IsFakeVector)
LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
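// The final result is (Mag & ~SignMask) | (Sign & SignMask): the magnitude
// bits of the first operand combined with the sign bit of the second.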
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
// Next, clear the sign bit from the first operand (magnitude).
// TODO: If we had general constant folding for FP logic ops, this check
// wouldn't be necessary.
SDValue MagBits;
if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
APFloat APF = Op0CN->getValueAPF();
APF.clearSign();
MagBits = DAG.getConstantFP(APF, dl, LogicVT);
} else {
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (IsFakeVector)
Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
}
// OR the magnitude value with the sign bit.
SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT OpVT = N0.getSimpleValueType();
assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
"Unexpected type for FGETSIGN");
// Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
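// MOVMSK packs the sign bit of every vector lane into an integer; AND'ing
// with 1 keeps only lane 0, i.e. the sign of the original scalar.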
MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
Res = DAG.getZExtOrTrunc(Res, dl, VT);
Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
return Res;
}
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
if (!Subtarget.hasSSE41())
return SDValue();
if (!Op->hasOneUse())
return SDValue();
SDNode *N = Op.getNode();
SDLoc DL(N);
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, unsigned> VecInMap;
SmallVector<SDValue, 8> VecIns;
EVT VT = MVT::Other;
// Recognize a special case where a vector is cast into a wide integer to
// test for all zeros.
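// For example, the OR of all the lanes of a v4i32 value, extracted one
// element at a time, is matched here and becomes PTEST %v, %v, which sets
// ZF exactly when every element (and hence the whole OR) is zero.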
Opnds.push_back(N->getOperand(0));
Opnds.push_back(N->getOperand(1));
for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
// BFS traverse all OR'd operands.
if (I->getOpcode() == ISD::OR) {
Opnds.push_back(I->getOperand(0));
Opnds.push_back(I->getOperand(1));
// Re-evaluate the number of nodes to be traversed.
e += 2; // 2 more nodes (LHS and RHS) are pushed.
continue;
}
// Quit if this operand is not an EXTRACT_VECTOR_ELT.
if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// Quit if the extract index is not a constant.
SDValue Idx = I->getOperand(1);
if (!isa<ConstantSDNode>(Idx))
return SDValue();
SDValue ExtractedFromVec = I->getOperand(0);
DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
if (M == VecInMap.end()) {
VT = ExtractedFromVec.getValueType();
// Quit if not 128/256-bit vector.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
// Quit if not the same type.
if (VecInMap.begin() != VecInMap.end() &&
VT != VecInMap.begin()->first.getValueType())
return SDValue();
M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
VecIns.push_back(ExtractedFromVec);
}
M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Not extracted from 128-/256-bit vector.");
unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
for (DenseMap<SDValue, unsigned>::const_iterator
I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
// Quit if not all elements are used.
if (I->second != FullMask)
return SDValue();
}
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
// Cast all vectors into TestVT for PTEST.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
// If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
SDValue LHS = VecIns[Slot];
SDValue RHS = VecIns[Slot + 1];
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
++UI) {
SDNode *User = *UI;
unsigned UOpNo = UI.getOperandNo();
if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
// Look past the truncate.
UOpNo = User->use_begin().getOperandNo();
User = *User->use_begin();
}
if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
!(User->getOpcode() == ISD::SELECT && UOpNo == 0))
return true;
}
return false;
}
// Emit KTEST instruction for bit vectors on AVX-512
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Op.getOpcode() == ISD::BITCAST) {
auto hasKTEST = [&](MVT VT) {
unsigned SizeInBits = VT.getSizeInBits();
return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
(Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
};
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = Op0.getValueType().getSimpleVT();
if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
hasKTEST(Op0VT))
return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
}
return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1) {
SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
DAG.getConstant(0, dl, MVT::i8));
}
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
NeedCF = true;
break;
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO: {
// Check if we really need to set the
// Overflow flag. If NoSignedWrap is present
// that is not actually needed.
switch (Op->getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::SHL:
if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
}
break;
}
}
// See if we can use the EFLAGS value from the operand instead of
// doing a separate TEST. TEST always sets OF and CF to 0, so unless
// we prove that the arithmetic won't overflow, we can't use OF or CF.
if (Op.getResNo() != 0 || NeedOF || NeedCF) {
// Emit KTEST for bit vectors
if (auto Node = EmitKTEST(Op, DAG, Subtarget))
return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
unsigned Opcode = 0;
unsigned NumOperands = 0;
// Truncate operations may prevent the merge of the SETCC instruction
// and the arithmetic instruction before it. Attempt to truncate the operands
// of the arithmetic instruction and use a reduced bit-width instruction.
bool NeedTruncation = false;
SDValue ArithOp = Op;
if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
SDValue Arith = Op->getOperand(0);
// Both the trunc and the arithmetic op need to have one user each.
if (Arith->hasOneUse())
switch (Arith.getOpcode()) {
default: break;
case ISD::ADD:
case ISD::SUB:
case ISD::AND:
case ISD::OR:
case ISD::XOR: {
NeedTruncation = true;
ArithOp = Arith;
}
}
}
// Sometimes flags can be set either with an AND or with an SRL/SHL
// instruction. SRL/SHL variant should be preferred for masks longer than this
// number of bits.
const int ShiftToAndMaxMaskWidth = 32;
const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
// which may be the result of a CAST. We use the variable 'Op', which is the
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::ADD:
// Due to an isel shortcoming, be conservative if this add is likely to be
// selected as part of a load-modify-store instruction. When the root node
// in a match is a store, isel doesn't know how to remap non-chain non-flag
// uses of other nodes in the match, such as the ADD in this case. This
// leads to the ADD being left around and reselected, with the result being
// two adds in the output. Alas, even if none of our users are stores, that
// doesn't prove we're O.K. Ergo, if we have any parents that aren't
// CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
// climbing the DAG back to the root, and it doesn't seem to be worth the
// effort.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() != ISD::CopyToReg &&
UI->getOpcode() != ISD::SETCC &&
UI->getOpcode() != ISD::STORE)
goto default_case;
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
// An add of one will be selected as an INC.
if (C->isOne() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
}
}
// Otherwise use a regular EFLAGS-setting add.
Opcode = X86ISD::ADD;
NumOperands = 2;
break;
case ISD::SHL:
case ISD::SRL:
// If we have a constant logical shift that's only used in a comparison
// against zero turn it into an equivalent AND. This allows turning it into
// a TEST instruction later.
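// For example, on i32 "(x >> 8) == 0" becomes "(x & 0xFFFFFF00) == 0",
// which can then be selected as a single TEST against the mask.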
if (ZeroCheck && Op->hasOneUse() &&
isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
EVT VT = Op.getValueType();
unsigned BitWidth = VT.getSizeInBits();
unsigned ShAmt = Op->getConstantOperandVal(1);
if (ShAmt >= BitWidth) // Avoid undefined shifts.
break;
APInt Mask = ArithOp.getOpcode() == ISD::SRL
? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
break;
Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
DAG.getConstant(Mask, dl, VT));
}
break;
case ISD::AND:
// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
// because a TEST instruction will be better. However, AND should be
// preferred if the instruction can be combined into ANDN.
if (!hasNonFlagsUse(Op)) {
SDValue Op0 = ArithOp->getOperand(0);
SDValue Op1 = ArithOp->getOperand(1);
EVT VT = ArithOp.getValueType();
bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
// If we cannot select an ANDN instruction, check if we can replace
// AND+IMM64 with a shift before giving up. This is possible for masks
// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
if (!isProperAndn) {
if (!ZeroCheck)
break;
assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
auto *CN = dyn_cast<ConstantSDNode>(Op1);
if (!CN)
break;
const APInt &Mask = CN->getAPIntValue();
if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
break; // Prefer TEST instruction.
unsigned BitWidth = Mask.getBitWidth();
unsigned LeadingOnes = Mask.countLeadingOnes();
unsigned TrailingZeros = Mask.countTrailingZeros();
if (LeadingOnes + TrailingZeros == BitWidth) {
assert(TrailingZeros < VT.getSizeInBits() &&
"Shift amount should be less than the type width");
MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
break;
}
unsigned LeadingZeros = Mask.countLeadingZeros();
unsigned TrailingOnes = Mask.countTrailingOnes();
if (LeadingZeros + TrailingOnes == BitWidth) {
assert(LeadingZeros < VT.getSizeInBits() &&
"Shift amount should be less than the type width");
MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
break;
}
break;
}
}
LLVM_FALLTHROUGH;
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
// Due to the ISEL shortcoming noted above, be conservative if this op is
// likely to be selected as part of a load-modify-store instruction.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() == ISD::STORE)
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && ZeroCheck) {
if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
return EFLAGS;
}
Opcode = X86ISD::OR;
break;
}
}
NumOperands = 2;
break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
default:
default_case:
break;
}
// If we found that truncation is beneficial, perform the truncation and
// update 'Op'.
if (NeedTruncation) {
EVT VT = Op.getValueType();
SDValue WideVal = Op->getOperand(0);
EVT WideVT = WideVal.getValueType();
unsigned ConvertedOp = 0;
// Use a target machine opcode to prevent further DAGCombine
// optimizations that may separate the arithmetic operations
// from the setcc node.
switch (WideVal.getOpcode()) {
default: break;
case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
case ISD::AND: ConvertedOp = X86ISD::AND; break;
case ISD::OR: ConvertedOp = X86ISD::OR; break;
case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
}
if (ConvertedOp) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
}
}
}
if (Opcode == 0) {
// Emit KTEST for bit vectors
if (auto Node = EmitKTEST(Op, DAG, Subtarget))
return Node;
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, dl, Op.getValueType()));
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
const SDLoc &dl, SelectionDAG &DAG) const {
if (isNullConstant(Op1))
return EmitTest(Op0, X86CC, dl, DAG);
assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
"Unexpected comparison operation for MVT::i1 operands");
if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
// Only promote the compare up to I32 if it is a 16 bit operation
// with an immediate. 16 bit immediates are to be avoided.
if ((Op0.getValueType() == MVT::i16 &&
(isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
!DAG.getMachineFunction().getFunction()->optForMinSize() &&
!Subtarget.isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
Op0, Op1);
return SDValue(Sub.getNode(), 1);
}
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
if (Subtarget.hasCMov() ||
Cmp.getOpcode() != X86ISD::CMP ||
!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
!Cmp.getOperand(1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
// (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
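// FNSTSW places the x87 status word in AX; C0/C2/C3 live in bits 8/10/14,
// so shifting right by 8 and SAHF'ing the low byte lands them in CF/PF/ZF,
// the same flags FUCOMI would have set.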
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
// Some 64-bit targets lack SAHF support, but they do support FCOMI.
assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// We never want to use both SQRT and RSQRT instructions for the same input.
if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
return false;
if (VT.isVector())
return Subtarget.hasFastVectorFSQRT();
return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
SelectionDAG &DAG, int Enabled,
int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
EVT VT = Op.getValueType();
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
UseOneConstNR = false;
return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
int Enabled,
int &RefinementSteps) const {
EVT VT = Op.getValueType();
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// reciprocal estimate with refinement on x86 prior to FMA requires
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(VT == MVT::v8f32 && Subtarget.hasAVX())) {
// Enable estimate codegen with 1 refinement step for vector division.
// Scalar division estimates are disabled because they break too much
// real-world code. These defaults are intended to match GCC behavior.
if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
return SDValue();
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
}
return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
// the encoding for the i16 version is larger than the i32 version.
// Also promote i16 to i32 for performance / code size reasons.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
// See if we can use the 32-bit instruction instead of the 64-bit one for a
// shorter encoding. Since the former takes the modulo 32 of BitNo and the
// latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
// known to be zero.
if (Src.getValueType() == MVT::i64 &&
DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
// BT ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
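// BT copies the tested bit into CF, so COND_B (CF == 1) means the bit was
// set and COND_AE (CF == 0) means it was clear, matching SETNE / SETEQ
// against zero.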
X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
return getSETCC(Cond, BT, dl , DAG);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
if (Op1.getOpcode() == ISD::TRUNCATE)
Op1 = Op1.getOperand(0);
SDValue LHS, RHS;
if (Op1.getOpcode() == ISD::SHL)
std::swap(Op0, Op1);
if (Op0.getOpcode() == ISD::SHL) {
if (isOneConstant(Op0.getOperand(0))) {
// If we looked past a truncate, check that it's only truncating away
// known zeros.
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
KnownBits Known;
DAG.computeKnownBits(Op0, Known);
if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
LHS = Op1;
RHS = Op0.getOperand(1);
}
} else if (Op1.getOpcode() == ISD::Constant) {
ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
uint64_t AndRHSVal = AndRHS->getZExtValue();
SDValue AndLHS = Op0;
if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
LHS = AndLHS.getOperand(0);
RHS = AndLHS.getOperand(1);
}
// Use BT if the immediate can't be encoded in a TEST instruction.
if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
LHS = AndLHS;
RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
}
}
if (LHS.getNode())
return getBitTestCondition(LHS, RHS, CC, dl, DAG);
return SDValue();
}
// Convert (truncate (srl X, N) to i1) to (bt X, N)
static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
"Expected TRUNCATE to i1 node");
if (Op.getOperand(0).getOpcode() != ISD::SRL)
return SDValue();
SDValue ShiftRight = Op.getOperand(0);
return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
CC, dl, DAG);
}
/// Result of 'and' or 'trunc to i1' is compared against zero.
/// Change to a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) const {
if (Op.getOpcode() == ISD::AND)
return LowerAndToBT(Op, CC, dl, DAG);
if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
return LowerTruncateToBT(Op, CC, dl, DAG);
return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
SDValue &Op1) {
unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
// 0 - EQ
// 1 - LT
// 2 - LE
// 3 - UNORD
// 4 - NEQ
// 5 - NLT
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLT:
case ISD::SETOLT: SSECC = 1; break;
case ISD::SETOGE:
case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETLE:
case ISD::SETOLE: SSECC = 2; break;
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: SSECC = 5; break;
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ:
case ISD::SETONE: SSECC = 8; break;
}
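// SSECC == 8 is a sentinel: SETUEQ and SETONE have no single SSE predicate;
// callers expand them into two compares combined with OR / AND respectively.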
if (Swap)
std::swap(Op0, Op1);
return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
"Unexpected type for boolean compare operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
DAG.getConstant(-1, dl, VT));
SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
DAG.getConstant(-1, dl, VT));
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETEQ:
// (x == y) -> ~(x ^ y)
return DAG.getNode(ISD::XOR, dl, VT,
DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
DAG.getConstant(-1, dl, VT));
case ISD::SETNE:
// (x != y) -> (x ^ y)
return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
case ISD::SETUGT:
case ISD::SETGT:
// (x > y) -> (x & ~y)
return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
case ISD::SETULT:
case ISD::SETLT:
// (x < y) -> (~x & y)
return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
case ISD::SETULE:
case ISD::SETLE:
// (x <= y) -> (~x | y)
return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
case ISD::SETUGE:
case ISD::SETGE:
// (x >= y) -> (x | ~y)
return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
}
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert(VT.getVectorElementType() == MVT::i1 &&
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
unsigned Opc = 0;
bool Unsigned = false;
bool Swap = false;
unsigned SSECC;
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: SSECC = 4; break;
case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
case ISD::SETULT: SSECC = 1; Unsigned = true; break;
case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
case ISD::SETLE: SSECC = 2; break;
}
if (Swap)
std::swap(Op0, Op1);
if (Opc)
return DAG.getNode(Opc, dl, VT, Op0, Op1);
Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, dl, MVT::i8));
}
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
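/// This relies on the identity (x <u C) == (x <=u C-1) for any constant
/// C != 0, which is also why a zero element makes the transform bail out.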
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
SelectionDAG &DAG) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
if (!BV)
return SDValue();
MVT VT = Op1.getSimpleValueType();
MVT EVT = VT.getVectorElementType();
unsigned n = VT.getVectorNumElements();
SmallVector<SDValue, 8> ULTOp1;
for (unsigned i = 0; i < n; ++i) {
ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
return SDValue();
// Avoid underflow.
APInt Val = Elt->getAPIntValue();
if (Val == 0)
return SDValue();
ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
}
return DAG.getBuildVector(VT, dl, ULTOp1);
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
Opc = X86ISD::CMPM;
} else {
Opc = X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
// available.
SDValue Cmp;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC == 8) {
// LLVM predicate is SETUEQ or SETONE.
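// SETUEQ decomposes into (unordered) OR (equal), and SETONE into
// (ordered) AND (not-equal), hence the two compares combined below.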
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
static_cast<unsigned>(ISD::OR);
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
static_cast<unsigned>(ISD::AND);
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
if (Opc == X86ISD::CMPP)
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
return Cmp;
}
MVT VTOp0 = Op0.getSimpleValueType();
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
if (VT.is128BitVector() && VTOp0.is256BitVector()) {
// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
// legalizer first checks whether the first operand of the setcc has a
// legal type. If so, it promotes the return type to that same type.
// Otherwise, the return type is promoted to the 'next legal type', which
// for a vector of MVT::i1 is always a 128-bit integer vector type.
//
// We reach this code only if the following two conditions are met:
// 1. Both return type and operand type have been promoted to wider types
// by the type legalizer.
// 2. The original operand type has been promoted to a 256-bit vector.
//
// Note that condition 2. only applies for AVX targets.
SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
return DAG.getZExtOrTrunc(NewOp, dl, VT);
}
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
"Value types for source and destination must be the same!");
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntVSETCC(Op, DAG);
// Operands are boolean (vectors of i1)
MVT OpVT = Op1.getSimpleValueType();
if (OpVT.getVectorElementType() == MVT::i1)
return LowerBoolVSETCC_AVX512(Op, DAG);
// The result is boolean, but operands are int/float
if (VT.getVectorElementType() == MVT::i1) {
// In the AVX-512 architecture, setcc returns a mask with i1 elements,
// but KNL has no compare instructions for i8 and i16 elements.
// In that case, use an SSE compare instead.
bool UseAVX512Inst =
(OpVT.is512BitVector() ||
OpVT.getScalarSizeInBits() >= 32 ||
(Subtarget.hasBWI() && Subtarget.hasVLX()));
if (UseAVX512Inst)
return LowerIntVSETCC_AVX512(Op, DAG);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
}
// Lower using XOP integer comparisons.
if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
case ISD::SETULE:
case ISD::SETLE: CmpMode = 0x01; break;
case ISD::SETUGT:
case ISD::SETGT: CmpMode = 0x02; break;
case ISD::SETUGE:
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
}
// Are we comparing unsigned or signed integers?
unsigned Opc =
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
Cond == ISD::SETGE || Cond == ISD::SETUGE;
bool Invert = Cond == ISD::SETNE ||
(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
// computing known bits.
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for SETULE/SETUGE
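// x <=u y holds iff umin(x, y) == x, and x >=u y holds iff umax(x, y) == x;
// the PCMPEQ against Op0 that completes the comparison is emitted further
// below once the min/max result is available.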
MVT VET = VT.getVectorElementType();
bool HasMinMax =
(Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
(Subtarget.hasSSE2() && (VET == MVT::i8));
bool MinMax = false;
if (HasMinMax) {
switch (Cond) {
default: break;
case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
if (MinMax)
Swap = Invert = FlipSigns = false;
}
bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
bool Subus = false;
if (!MinMax && HasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
// t = psubus Op0, Op1
// pcmpeq t, <0..0>
switch (Cond) {
default: break;
case ISD::SETULT: {
// If the comparison is against a constant, we can turn this into a
// setule. With psubus, setule does not require a swap. This is
// beneficial because the constant in the register is no longer
// clobbered as the destination, so it can be hoisted out of a loop.
// Only do this pre-AVX, since with AVX vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
break;
if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
Op1 = ULEOp1;
Subus = true; Invert = false; Swap = false;
}
break;
}
// Psubus is better than flip-sign because it requires no inversion.
case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
}
if (Subus) {
Opc = X86ISD::SUBUS;
FlipSigns = false;
}
}
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
} else {
SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
}
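// PCMPGTD is a signed compare, so lanes that must be compared unsigned get
// their sign bit flipped: all four 32-bit lanes for an unsigned 64-bit
// compare, otherwise only the low halves (lanes 0 and 2), which are always
// compared unsigned.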
Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
static const int MaskHi[] = { 1, 1, 3, 3 };
static const int MaskLo[] = { 0, 0, 2, 2 };
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
// pcmpeqd + pshufd + pand.
assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
Op0 = DAG.getBitcast(MVT::v4i32, Op0);
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
static const int Mask[] = { 1, 0, 3, 2 };
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
return DAG.getBitcast(VT, Result);
}
}
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
if (MinMax)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
if (Subus)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
getZeroVector(VT, Subtarget, DAG, dl));
return Result;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
// Lower (trunc (X >> N) to i1) to BT(X, N).
if (Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
return NewSetCC;
}
}
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// If the input is a setcc, then reuse the input setcc or use a new one with
// the inverted condition.
if (Op0.getOpcode() == X86ISD::SETCC) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
if (!Invert)
return Op0;
CCode = X86::GetOppositeBranchCondition(CCode);
SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
return SetCC;
}
}
if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
if (isOneConstant(Op1)) {
ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
}
if (!isNullConstant(Op1)) {
SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
}
}
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
return SetCC;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// Recreate the carry if needed.
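// Adding all-ones to the boolean carry operand produces a carry-out (CF)
// exactly when the carry was nonzero; the SBB below then consumes that flag.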
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
if (Op.getSimpleValueType() == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return SetCC;
}
/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
return true;
return false;
}
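/// Return true if \p V is a truncate whose truncated-away high bits are known
/// to be zero, so the truncate can safely be looked through.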
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
if (V.getOpcode() != ISD::TRUNCATE)
return false;
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
MVT VT = Op1.getSimpleValueType();
SDValue CC;
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
int SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
DL, VT, Cmp, Op1, Op2);
}
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
// don't bother.
if (Subtarget.hasAVX() &&
!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
}
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
Op1Scalar = Op1.getOperand(0);
SDValue Op2Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
DAG.getIntPtrConstant(0, DL));
}
}
if (VT == MVT::v4i1 || VT == MVT::v2i1) {
SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
// select were also updated (for example, EmitTest has a RAUW). Refresh
// the local references to the select operands in case they got stale.
Op1 = Op.getOperand(1);
Op2 = Op.getOperand(2);
}
}
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
unsigned CondCode =
cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
return Res;
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// Returns true if Op2 is an XOR or OR operation and one of its operands
// is equal to Op1, i.e. the selected values match the pattern
// (a, a op b) or (b, a op b).
auto isOrXorPattern = [&]() {
if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
(Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
Src1 =
Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
Src2 = Op1;
return true;
}
return false;
};
if (isOrXorPattern()) {
SDValue Neg;
unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
// We need a mask of all zeros or all ones with the same size as the
// other operands.
if (CmpSz > VT.getSizeInBits())
Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
else if (CmpSz < VT.getSizeInBits())
Neg = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Neg); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
}
}
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
Opc == X86ISD::BT) { // FIXME
Cond = Cmp;
AddTest = false;
}
} else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
Cond.getOperand(0).getValueType() != MVT::i8)) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
unsigned X86Opcode;
unsigned X86Cond;
SDVTList VTs;
switch (CondOpcode) {
case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
default: llvm_unreachable("unexpected overflowing operator");
}
if (CondOpcode == ISD::UMULO)
VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
MVT::i32);
else
VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
if (CondOpcode == ISD::UMULO)
Cond = X86Op.getValue(2);
else
Cond = X86Op.getValue(1);
CC = DAG.getConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
if (AddTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
AddTest = false;
}
}
}
if (AddTest) {
CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
}
}
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
if (Op.getValueType() == MVT::i8 &&
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Blacklist CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = { Op2, Op1, CC, Cond };
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
MVT VTElt = VT.getVectorElementType();
MVT InVTElt = InVT.getVectorElementType();
SDLoc dl(Op);
// SKX processor
if ((InVTElt == MVT::i1) &&
(((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned NumElts = VT.getVectorNumElements();
if (VT.is512BitVector() && InVTElt != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
}
if (InVTElt != MVT::i1)
return SDValue();
MVT ExtVT = VT;
if (!VT.is512BitVector() && !Subtarget.hasVLX())
ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
SDValue V;
if (Subtarget.hasDQI()) {
V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
assert(!VT.is512BitVector() && "Unexpected vector type");
} else {
SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT VT = Op->getSimpleValueType(0);
MVT InVT = In.getSimpleValueType();
assert(VT.getSizeInBits() == InVT.getSizeInBits());
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
!(VT.is256BitVector() && Subtarget.hasInt256()) &&
!(VT.is512BitVector() && Subtarget.hasAVX512()))
return SDValue();
SDLoc dl(Op);
// For 256-bit vectors, we only need the lower (128-bit) half of the input.
// For 512-bit vectors, we need 128-bits or 256-bits.
if (VT.getSizeInBits() > 128) {
// Input needs to be at least the same number of elements as output, and
// at least 128-bits.
int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
}
assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
// On SSE41 targets, 128-bit results can use the pmovsx* instructions
// directly, so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
// instructions still need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);
}
// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
"Unexpected opcode!");
// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
SDValue Curr = In;
MVT CurrVT = InVT;
// As SRAI is only available on i16/i32 types, we expand only up to i32
// and handle i64 separately.
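// Each UNPCKL with an undef first operand places the source element in the
// high half of a lane twice as wide; the VSRAI below then shifts it back
// down while replicating its sign bit.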
while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
Curr = DAG.getBitcast(CurrVT, Curr);
}
SDValue SignExt = Curr;
if (CurrVT != InVT) {
unsigned SignExtShift =
CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
DAG.getConstant(SignExtShift, dl, MVT::i8));
}
if (CurrVT == VT)
return SignExt;
if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
DAG.getConstant(31, dl, MVT::i8));
SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
return DAG.getBitcast(VT, Ext);
}
return SDValue();
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
(VT != MVT::v8i32 || InVT != MVT::v8i16) &&
(VT != MVT::v16i16 || InVT != MVT::v16i8))
return SDValue();
if (Subtarget.hasInt256())
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
// Optimize vectors in AVX mode:
// sign extend v8i16 to v8i32 and v4i32 to v4i64.
//
// Divide the input vector into two parts; for v4i32 the shuffle masks
// will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Use the vpmovsx
// instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32, then
// concatenate the halves back into the original VT.
unsigned NumElems = InVT.getVectorNumElements();
SDValue Undef = DAG.getUNDEF(InVT);
SmallVector<int,8> ShufMask1(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask1[i] = i;
SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
SmallVector<int,8> ShufMask2(NumElems, -1);
for (unsigned i = 0; i != NumElems/2; ++i)
ShufMask2[i] = i + NumElems/2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
// Lower a truncating store. We need special lowering for vXi1 vectors.
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
SDLoc dl(St);
EVT MemVT = St->getMemoryVT();
assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
"Expected truncstore of i1 vector");
SDValue Op = St->getValue();
MVT OpVT = Op.getValueType().getSimpleVT();
unsigned NumElts = OpVT.getVectorNumElements();
if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
NumElts == 16) {
// Truncate and store - everything is legal
Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
if (MemVT.getSizeInBits() < 8)
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
St->getMemOperand());
}
// A subset, assume that we have only AVX-512F
if (NumElts <= 8) {
if (NumElts < 8) {
// Extend to 8-elts vector
MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
}
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
St->getMemOperand());
}
// v32i8
assert(OpVT == MVT::v32i8 && "Unexpected operand type");
// Divide the vector into 2 parts and store each part separately
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
DAG.getIntPtrConstant(0, dl));
Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
SDValue BasePtr = St->getBasePtr();
SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
St->getMemOperand());
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
DAG.getIntPtrConstant(16, dl));
Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
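// The low 16 mask bits occupy the first two bytes of the memory location, so
// the upper half is stored two bytes past the base pointer.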
SDValue BasePtrHi =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(2, dl, BasePtr.getValueType()));
SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
BasePtrHi, St->getMemOperand());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
}
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
"Expected i1 vector load");
unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
MVT VT = Op.getValueType().getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && NumElts >= 32) ||
(Subtarget.hasDQI() && NumElts < 16) ||
NumElts == 16) {
// Load and extend - everything is legal
if (NumElts < 8) {
SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getMemOperand());
// Replace chain users with the new chain.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getMemOperand());
// Replace chain users with the new chain.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
// Finally, do a normal sign-extend to the desired register.
return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
}
if (NumElts <= 8) {
// A subset, assume that we have only AVX-512F
unsigned NumBitsToLoad = 8;
MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getMemOperand());
// Replace chain users with the new chain.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
SDValue BitVec = DAG.getBitcast(MaskVT, Load);
if (NumElts == 8)
return DAG.getNode(ExtOpcode, dl, VT, BitVec);
// We still need to handle v4i1 and v2i1 here.
MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
DAG.getIntPtrConstant(0, dl));
}
assert(VT == MVT::v32i8 && "Unexpected extload type");
SmallVector<SDValue, 2> Chains;
SDValue BasePtr = Ld->getBasePtr();
SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getMemOperand());
Chains.push_back(LoadLo.getValue(1));
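// As in the truncating-store lowering above, the upper 16 mask bits are read
// from two bytes past the base pointer.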
SDValue BasePtrHi =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
DAG.getConstant(2, dl, BasePtr.getValueType()));
SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
BasePtrHi,
Ld->getMemOperand());
Chains.push_back(LoadHi.getValue(1));
SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector sext loads.");
// Nothing useful we can do without SSE2 shuffles.
assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
if (MemVT.getScalarType() == MVT::i1)
return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
ISD::LoadExtType Ext = Ld->getExtensionType();
assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
&& "Only anyext and sext are currently implemented.");
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned NumElems = RegVT.getVectorNumElements();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
// The only way in which we have a legal 256-bit vector result but not the
// integer 256-bit operations needed to directly lower a sextload is if we
// have AVX1 but not AVX2. In that case, we can always emit a sextload to
// a 128-bit vector and a normal sign_extend to 256-bits that should get
// correctly legalized. We do this late to allow the canonical form of
// sextload to persist throughout the rest of the DAG combiner -- it wants
// to fold together any extensions it can, and so will fuse a sign_extend
// of an sextload into a sextload targeting a wider value.
SDValue Load;
if (MemSz == 128) {
// Just switch this to a normal load.
assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
"it must be a legal 128-bit vector "
"type!");
Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
} else {
assert(MemSz < 128 &&
"Can't extend a type wider than 128 bits to a 256 bit vector!");
// Do an sext load to a 128-bit vector type. We want to use the same
// number of elements, but elements half as wide. This will end up being
// recursively lowered by this routine, but will succeed as we definitely
// have all the necessary features if we're using AVX1.
EVT HalfEltVT =
EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
Load =
DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
}
// Replace chain users with the new chain.
assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
// Finally, do a normal sign-extend to the desired register.
return DAG.getSExtOrTrunc(Load, dl, RegVT);
}
// All sizes must be a power of two.
assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
"Non-power-of-two elements are not custom lowered!");
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
}
// On 32-bit systems, we can't use 64-bit integer loads; try bitcasting to f64.
if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
(64 <= MemSz))
SclrLoadTy = MVT::f64;
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
"Can only lower sext loads with a single scalar load!");
unsigned loadRegSize = RegSz;
if (Ext == ISD::SEXTLOAD && RegSz >= 256)
loadRegSize = 128;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(
*DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we 'widen' MemVT.
EVT WideVecVT =
EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
loadRegSize / MemVT.getScalarSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
// We can't shuffle using an illegal type.
assert(TLI.isTypeLegal(WideVecVT) &&
"We only lower types that form legal widened vector types");
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
for (unsigned i = 0; i < NumLoads; ++i) {
// Perform a single load.
SDValue ScalarLoad =
DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
Ld->getAlignment(), Ld->getMemOperand()->getFlags());
Chains.push_back(ScalarLoad.getValue(1));
// Create the first element type using SCALAR_TO_VECTOR in order to avoid
// another round of DAGCombining.
if (i == 0)
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
else
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
ScalarLoad, DAG.getIntPtrConstant(i, dl));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
if (Subtarget.hasSSE41()) {
SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
}
// Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
// lanes.
assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
}
// Redistribute the loaded elements into the different locations.
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i * SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
DAG.getUNDEF(WideVecVT), ShuffleVec);
// Bitcast to the requested type.
Shuff = DAG.getBitcast(RegVT, Shuff);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Opc = Op.getOpcode();
if (Opc != ISD::OR && Opc != ISD::AND)
return false;
return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse() &&
Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
Op.getOperand(1).hasOneUse());
}
/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
if (Op.getOpcode() != ISD::XOR)
return false;
if (isOneConstant(Op.getOperand(1)))
return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
Op.getOperand(0).hasOneUse();
return false;
}
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
SDValue CC;
bool Inverted = false;
if (Cond.getOpcode() == ISD::SETCC) {
// Check for setcc([su]{add,sub,mul}o == 0).
if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
Cond.getOperand(0).getResNo() == 1 &&
(Cond.getOperand(0).getOpcode() == ISD::SADDO ||
Cond.getOperand(0).getOpcode() == ISD::UADDO ||
Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
Cond.getOperand(0).getOpcode() == ISD::USUBO ||
Cond.getOperand(0).getOpcode() == ISD::SMULO ||
Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
Inverted = true;
Cond = Cond.getOperand(0);
} else {
if (SDValue NewCond = LowerSETCC(Cond, DAG))
Cond = NewCond;
}
}
#if 0
// FIXME: LowerXALUO doesn't handle these!!
else if (Cond.getOpcode() == X86ISD::ADD ||
Cond.getOpcode() == X86ISD::SUB ||
Cond.getOpcode() == X86ISD::SMUL ||
Cond.getOpcode() == X86ISD::UMUL)
Cond = LowerXALUO(Cond, DAG);
#endif
// Look past (and (setcc_carry (cmp ...)), 1).
if (Cond.getOpcode() == ISD::AND &&
Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
isOneConstant(Cond.getOperand(1)))
Cond = Cond.getOperand(0);
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
if (CondOpcode == X86ISD::SETCC ||
CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
// FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
Cond = Cmp;
addTest = false;
} else {
switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
default: break;
case X86::COND_O:
case X86::COND_B:
// These can only come from an arithmetic instruction with overflow,
// e.g. SADDO, UADDO.
Cond = Cond.getOperand(1);
addTest = false;
break;
}
}
}
CondOpcode = Cond.getOpcode();
if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
Cond.getOperand(0).getValueType() != MVT::i8)) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
unsigned X86Opcode;
unsigned X86Cond;
SDVTList VTs;
// Keep this in sync with LowerXALUO, otherwise we might create redundant
// instructions that can't be removed afterwards (i.e. X86ISD::ADD and
// X86ISD::INC).
switch (CondOpcode) {
case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
case ISD::SADDO:
if (isOneConstant(RHS)) {
X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
break;
}
X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
case ISD::SSUBO:
if (isOneConstant(RHS)) {
X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
break;
}
X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
default: llvm_unreachable("unexpected overflowing operator");
}
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
if (CondOpcode == ISD::UMULO)
VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
MVT::i32);
else
VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
if (CondOpcode == ISD::UMULO)
Cond = X86Op.getValue(2);
else
Cond = X86Op.getValue(1);
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
SDValue Cmp = Cond.getOperand(0).getOperand(1);
if (CondOpc == ISD::OR) {
// Also, recognize the pattern generated by an FCMP_UNE. We can emit
// two branches instead of an explicit OR instruction with a
// separate test.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp)) {
CC = Cond.getOperand(0).getOperand(0);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = Cond.getOperand(1).getOperand(0);
Cond = Cmp;
addTest = false;
}
} else { // ISD::AND
// Also, recognize the pattern generated by an FCMP_OEQ. We can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
}
} else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
// Recognize (xorb (setcc), 1) patterns. The xor inverts the condition.
// This should be transformed by the DAG combiner, except when the condition
// is set by an arithmetic-with-overflow node.
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
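// After the floating-point compare emitted below, the ordered-and-equal case
// leaves ZF set and PF clear, so we branch to the false block on NE and again
// on P, and fall through (via the following unconditional branch) when OEQ
// holds.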
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_OEQ.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
}
} else if (Cond.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
// have a fall-through edge, because this requires an explicit
// jmp when the condition is false.
if (Op.getNode()->hasOneUse()) {
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
// to implement FCMP_UNE.
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
Cond = Cmp;
addTest = false;
Dest = FalseBB;
}
}
}
}
if (addTest) {
// Look past the truncate if the high bits are known zero.
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
// We know the result is compared against zero. Try to match it to BT.
if (Cond.hasOneUse()) {
if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
addTest = false;
}
}
}
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getConstant(X86Cond, dl, MVT::i8);
Cond = EmitTest(Cond, X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated
// in the correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Result = DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (Is64Bit) {
// The 64-bit implementation of segmented stacks needs to clobber both r10 and
// r11. This makes it impossible to use it along with nested parameters.
const Function *F = MF.getFunction();
for (const auto &A : F->args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
}
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (Align) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
Result = SP;
}
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
SDValue Ops[2] = {Result, Chain};
return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
// __va_list_tag:
// gp_offset (0 - 6 * 8)
// fp_offset (48 - 48 + 8 * 16)
// overflow_arg_area (points to parameters passed in memory).
// reg_save_area
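// In C terms the layout being initialized here is roughly:
//   struct __va_list_tag {
//     unsigned gp_offset;       // byte offset 0
//     unsigned fp_offset;       // byte offset 4
//     void *overflow_arg_area;  // byte offset 8
//     void *reg_save_area;      // byte offset 16 on LP64 (12 on ILP32)
//   };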
SmallVector<SDValue, 8> MemOps;
SDValue FIN = Op.getOperand(1);
// Store gp_offset
SDValue Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV));
MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
MachinePointerInfo(SV, 4));
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
Store =
DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
MemOps.push_back(Store);
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
MemOps.push_back(Store);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.is64Bit() &&
"LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue SrcPtr = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
unsigned Align = Op.getConstantOperandVal(3);
SDLoc dl(Op);
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
if (ArgVT == MVT::f80) {
llvm_unreachable("va_arg for f80 not yet implemented");
} else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
llvm_unreachable("Unhandled argument type in LowerVAARG");
}
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
!(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
MachinePointerInfo(SV),
/*Align=*/0,
/*Volatile=*/false,
/*ReadMem=*/true,
/*WriteMem=*/true);
Chain = VAARG.getValue(1);
// Load the next argument and return it
return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
DAG.getMachineFunction().getFunction()->getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue DstPtr = Op.getOperand(1);
SDValue SrcPtr = Op.getOperand(2);
const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
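// Copy the whole 24-byte __va_list_tag (two i32 offsets plus two 8-byte
// pointers) with 8-byte alignment.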
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
/// Handle vector element shifts where the shift amount is a constant.
/// Takes immediate version of shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, uint64_t ShiftAmt,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
// Bitcast the source vector to the output type, this is mainly necessary for
// vXi8/vXi64 shifts.
if (VT != SrcOp.getSimpleValueType())
SrcOp = DAG.getBitcast(VT, SrcOp);
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
// Check for ShiftAmt >= element width
if (ShiftAmt >= ElementType.getSizeInBits()) {
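// Arithmetic right shifts saturate at BitWidth - 1 (the result is a splat of
// the sign bit); logical shifts by the element width or more produce zero.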
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
else
return DAG.getConstant(0, dl, VT);
}
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;
switch(Opc) {
default: llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
for (unsigned i=0; i!=NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
Elts.push_back(CurrentOp);
continue;
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
break;
}
return DAG.getBuildVector(VT, dl, Elts);
}
return DAG.getNode(Opc, dl, VT, SrcOp,
DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
SDValue SrcOp, SDValue ShAmt,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SVT = ShAmt.getSimpleValueType();
assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
// Catch shift-by-constant.
if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
}
// Need to build a vector containing shift amount.
// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
// +=================+============+=======================================+
// | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
// +=================+============+=======================================+
// | i64 | Yes, No | Use ShAmt as lowest elt |
// | i32 | Yes | zero-extend in-reg |
// | (i32 zext(i16)) | Yes | zero-extend in-reg |
// | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud) |
// +=================+============+=======================================+
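// For example, shifting v8i16 by an i32 amount on a pre-SSE4.1 target builds
// (v4i32 ShAmt, 0, undef, undef) and bitcasts it to v8i16 below.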
if (SVT == MVT::i64)
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
ShAmt = ShAmt.getOperand(0);
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
// The return type has to be a 128-bit type with the same element
// type as the input type.
MVT EltVT = VT.getVectorElementType();
MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
return DAG.getTargetConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
return DAG.getTargetConstant(0, dl, MaskVT);
if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
// Mask should be extended
Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
}
if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
if (MaskVT == MVT::v64i1) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
// In 32-bit mode, bitcasting an i64 is illegal; split it into two i32 halves.
SDValue Lo, Hi;
Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(0, dl, MVT::i32));
Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
DAG.getConstant(1, dl, MVT::i32));
Lo = DAG.getBitcast(MVT::v32i1, Lo);
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
// MaskVT requires fewer than 64 bits. Truncate the mask (this should always
// succeed) and bitcast.
MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
return DAG.getBitcast(MaskVT,
DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
}
} else {
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
// When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
DAG.getBitcast(BitcastVT, Mask),
DAG.getIntPtrConstant(0, dl));
}
}
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
SDLoc dl(Op);
if (isAllOnesConstant(Mask))
return Op;
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
switch (Op.getOpcode()) {
default: break;
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
case X86ISD::CMPMU:
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
case X86ISD::VFPCLASS:
case X86ISD::VFPCLASSS:
return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
case X86ISD::CVTPS2PH:
// We can't use ISD::VSELECT here because it is not always "Legal"
// for the destination type. For example, vpmovqb requires only AVX512,
// but a vselect that can operate on byte elements requires BWI.
OpcodeSelect = X86ISD::SELECT;
break;
}
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is that
/// the former uses "X86select" instead of "vselect": we simply cannot create a
/// "vselect" node for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
if (MaskConst->getZExtValue() & 0x1)
return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
if (!Fn->hasPersonalityFn())
report_fatal_error(
"querying registration node size for function without personality");
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
case EHPersonality::MSVC_X86SEH: return 24;
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
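/// For example, with the 32-bit MSVC SEH personality the registration node is
/// 24 bytes (see getSEHRegistrationNodeSize), so
/// ParentFP = EntryEBP - 24 - ParentFrameOffset.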
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
SDValue EntryEBP) {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc dl;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
// It's possible that the parent function no longer has a personality function
// if the exceptional code was optimized away, in which case we just return
// the incoming EBP.
if (!Fn->hasPersonalityFn())
return EntryEBP;
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
// Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
// prologue to RBP in the parent function.
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.is64Bit())
return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
// ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (!isa<ConstantSDNode>(Rnd))
return false;
unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
};
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
case INTR_TYPE_2OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue RoundingMode;
// We always add rounding mode to the Node.
// If the rounding mode is not specified, we add the
// "current direction" mode.
if (Op.getNumOperands() == 4)
RoundingMode =
DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
else
RoundingMode = Op.getOperand(4);
assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
RoundingMode),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RM Opcode is specified and
// - RM is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src, Rnd),
Mask, PassThru, Subtarget, DAG);
}
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, VT, Src1, Src2, Rnd),
Mask, passThru, Subtarget, DAG);
}
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
// There are 2 kinds of intrinsics in this group:
// (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
// (2) With rounding mode and sae - 7 operands.
if (Op.getNumOperands() == 6) {
SDValue Sae = Op.getOperand(5);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
Sae),
Mask, Src0, Subtarget, DAG);
}
assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
SDValue RoundingMode = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK:
case INTR_TYPE_2OP_IMM8_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), and then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Rnd),
Mask, PassThru, Subtarget, DAG);
}
}
// TODO: Intrinsics should have fast-math-flags to propagate.
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
// We specify 2 possible modes for intrinsics, with/without rounding
// modes.
// First we check whether the intrinsic has a rounding mode (6 operands);
// if not, we set the rounding mode to "current".
SDValue Rnd;
if (Op.getNumOperands() == 6)
Rnd = Op.getOperand(5);
else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Rnd),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
Src2, Src3, Sae),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Imm = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
// We specify 2 possible modes for intrinsics, with/without rounding
// modes.
// First we check whether the intrinsic has a rounding mode (7 operands);
// if not, we set the rounding mode to "current".
SDValue Rnd;
if (Op.getNumOperands() == 7)
Rnd = Op.getOperand(6);
else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Imm, Rnd),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_IMM8_MASK:
case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), and then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(6);
if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Src3, Rnd),
Mask, PassThru, Subtarget, DAG);
}
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case VPERM_2OP_MASK : {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
// Swap Src1 and Src2 in the node creation
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
Mask, PassThru, Subtarget, DAG);
}
case VPERM_3OP_MASKZ:
case VPERM_3OP_MASK:{
MVT VT = Op.getSimpleValueType();
// Src2 is the PassThru
SDValue Src1 = Op.getOperand(1);
// PassThru needs to be the same type as the destination in order
// to pattern match correctly.
SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
SDValue Src3 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue PassThru = SDValue();
// set PassThru element
if (IntrData->Type == VPERM_3OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else
PassThru = Src2;
// Swap Src1 and Src2 in the node creation
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
dl, Op.getValueType(),
Src2, Src1, Src3),
Mask, PassThru, Subtarget, DAG);
}
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
MVT VT = Op.getSimpleValueType();
SDValue PassThru = SDValue();
// set PassThru element
if (IntrData->Type == FMA_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else if (IntrData->Type == FMA_OP_MASK3)
PassThru = Src3;
else
PassThru = Src1;
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), and then we check the rounding mode operand.
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src1, Src2, Src3, Rnd),
Mask, PassThru, Subtarget, DAG);
}
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
dl, Op.getValueType(),
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case FMA_OP_SCALAR_MASK:
case FMA_OP_SCALAR_MASK3:
case FMA_OP_SCALAR_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
MVT VT = Op.getSimpleValueType();
SDValue PassThru = SDValue();
// set PassThru element
if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
PassThru = Src3;
else
PassThru = Src1;
SDValue Rnd = Op.getOperand(5);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
Op.getValueType(), Src1, Src2,
Src3, Rnd),
Mask, PassThru, Subtarget, DAG);
}
case TERLOG_OP_MASK:
case TERLOG_OP_MASKZ: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
SDValue Mask = Op.getOperand(5);
MVT VT = Op.getSimpleValueType();
SDValue PassThru = Src1;
// Set PassThru element.
if (IntrData->Type == TERLOG_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Src3, Src4),
Mask, PassThru, Subtarget, DAG);
}
case CVTPD2PS:
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
DAG.getIntPtrConstant(0, dl));
case CVTPD2PS_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
// We add rounding mode to the Node when
// - RM Opcode is specified and
// - RM is not "current direction".
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
if (IntrWithRoundingModeOpcode != 0) {
SDValue Rnd = Op.getOperand(4);
if (!isRoundModeCurDirection(Rnd)) {
return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
dl, Op.getValueType(),
Src, Rnd),
Mask, PassThru, Subtarget, DAG);
}
}
assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
DAG.getIntPtrConstant(0, dl)),
Mask, PassThru, Subtarget, DAG);
}
case FPCLASS: {
// FPclass intrinsics with mask
SDValue Src1 = Op.getOperand(1);
MVT VT = Src1.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MaskVT),
Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), FPclassMask,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
case FPCLASSS: {
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast
// (v8i1 (insert_subvector undef,
// (v2i1 (and (PCMPEQM %a, %b),
// (extract_subvector
// (v8i1 (bitcast %mask)), 0))), 0))))
MVT VT = Op.getOperand(1).getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
SDValue Cmp;
if (IntrData->Type == CMP_MASK_CC) {
SDValue CC = Op.getOperand(3);
CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First we check whether the intrinsic may have a non-default rounding mode
// (IntrData->Opc1 != 0), and then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC, Rnd);
}
// Default rounding mode.
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2), CC);
} else {
assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2));
}
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MaskVT),
Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
}
// Default rounding mode.
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MVT::i1),
Subtarget, DAG);
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
break;
}
case ISD::SETNE: { // (ZF = 1 or PF = 1)
SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
break;
}
case ISD::SETGE: // CF = 0
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8));
else
FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), Subtarget,
DAG);
case COMPRESS_EXPAND_IN_REG: {
SDValue Mask = Op.getOperand(3);
SDValue DataToCompress = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
if (isAllOnesConstant(Mask)) // return data as is
return Op.getOperand(1);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
DataToCompress),
Mask, PassThru, Subtarget, DAG);
}
case BROADCASTM: {
SDValue Mask = Op.getOperand(1);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
case KUNPCK: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
// Arguments should be swapped.
SDValue Res = DAG.getNode(IntrData->Opc0, dl,
MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
return DAG.getBitcast(VT, Res);
}
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
case FIXUPIMM_MASKZ:{
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue Imm = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
Src1 : getZeroVector(VT, Subtarget, DAG, dl);
// We specify 2 possible modes for intrinsics, with/without rounding
// modes.
// First we check whether the intrinsic has a rounding mode (7 operands);
// if not, we set the rounding mode to "current".
SDValue Rnd;
if (Op.getNumOperands() == 7)
Rnd = Op.getOperand(6);
else
Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Src3, Imm, Rnd),
Mask, Passthru, Subtarget, DAG);
else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
Src1, Src2, Src3, Imm, Rnd),
Mask, Passthru, Subtarget, DAG);
}
case CONVERT_TO_MASK: {
MVT SrcVT = Op.getOperand(1).getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
Op.getOperand(1));
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CvtMask,
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
EVT resVT = Passthru.getValueType();
SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
DAG.getUNDEF(resVT), Src,
DAG.getIntPtrConstant(0, dl));
SDValue immVal;
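// When broadcasting a 256-bit subvector into a 512-bit result, a SHUF128-style
// immediate of 0x44 selects 128-bit lanes {0,1,0,1}, repeating the low 256
// bits; otherwise an immediate of 0 replicates lane 0.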
if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
immVal = DAG.getConstant(0x44, dl, MVT::i8);
else
immVal = DAG.getConstant(0, dl, MVT::i8);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
subVec, subVec, immVal),
Mask, Passthru, Subtarget, DAG);
}
case BRCST32x2_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
assert((VT.getScalarType() == MVT::i32 ||
VT.getScalarType() == MVT::f32) && "Unexpected type!");
// Bitcast Src to a vector of packed 64-bit elements.
MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
Src = DAG.getBitcast(BitcastVT, Src);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
default:
break;
}
}
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
// but second operand for node/instruction.
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestz_256:
case Intrinsic::x86_avx_ptestc_256:
case Intrinsic::x86_avx_ptestnzc_256:
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
bool IsTestPacked = false;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx_vtestz_ps:
case Intrinsic::x86_avx_vtestz_pd:
case Intrinsic::x86_avx_vtestz_ps_256:
case Intrinsic::x86_avx_vtestz_pd_256:
IsTestPacked = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestz:
case Intrinsic::x86_avx_ptestz_256:
// ZF = 1
X86CC = X86::COND_E;
break;
case Intrinsic::x86_avx_vtestc_ps:
case Intrinsic::x86_avx_vtestc_pd:
case Intrinsic::x86_avx_vtestc_ps_256:
case Intrinsic::x86_avx_vtestc_pd_256:
IsTestPacked = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestc:
case Intrinsic::x86_avx_ptestc_256:
// CF = 1
X86CC = X86::COND_B;
break;
case Intrinsic::x86_avx_vtestnzc_ps:
case Intrinsic::x86_avx_vtestnzc_pd:
case Intrinsic::x86_avx_vtestnzc_ps_256:
case Intrinsic::x86_avx_vtestnzc_pd_256:
IsTestPacked = true;
LLVM_FALLTHROUGH;
case Intrinsic::x86_sse41_ptestnzc:
case Intrinsic::x86_avx_ptestnzc_256:
// ZF and CF = 0
X86CC = X86::COND_A;
break;
}
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_kortestz_w:
case Intrinsic::x86_avx512_kortestc_w: {
X86::CondCode X86CC =
(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_knot_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_avx512_kandn_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
// Invert LHS for the not.
LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
DAG.getConstant(1, dl, MVT::v16i1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_avx512_kxnor_w: {
SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
// Invert result for the not.
Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
DAG.getConstant(1, dl, MVT::v16i1));
return DAG.getBitcast(MVT::i16, Res);
}
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
case Intrinsic::x86_sse42_pcmpestric128:
case Intrinsic::x86_sse42_pcmpistrio128:
case Intrinsic::x86_sse42_pcmpestrio128:
case Intrinsic::x86_sse42_pcmpistris128:
case Intrinsic::x86_sse42_pcmpestris128:
case Intrinsic::x86_sse42_pcmpistriz128:
case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTRI;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpestria128:
Opcode = X86ISD::PCMPESTRI;
X86CC = X86::COND_A;
break;
case Intrinsic::x86_sse42_pcmpistric128:
Opcode = X86ISD::PCMPISTRI;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpestric128:
Opcode = X86ISD::PCMPESTRI;
X86CC = X86::COND_B;
break;
case Intrinsic::x86_sse42_pcmpistrio128:
Opcode = X86ISD::PCMPISTRI;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpestrio128:
Opcode = X86ISD::PCMPESTRI;
X86CC = X86::COND_O;
break;
case Intrinsic::x86_sse42_pcmpistris128:
Opcode = X86ISD::PCMPISTRI;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpestris128:
Opcode = X86ISD::PCMPESTRI;
X86CC = X86::COND_S;
break;
case Intrinsic::x86_sse42_pcmpistriz128:
Opcode = X86ISD::PCMPISTRI;
X86CC = X86::COND_E;
break;
case Intrinsic::x86_sse42_pcmpestriz128:
Opcode = X86ISD::PCMPESTRI;
X86CC = X86::COND_E;
break;
}
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_sse42_pcmpistri128:
case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTRI;
else
Opcode = X86ISD::PCMPESTRI;
SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
case Intrinsic::eh_sjlj_lsda: {
MachineFunction &MF = DAG.getMachineFunction();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
// Compute the symbol for the LSDA. We know it'll get emitted later.
MachineFunction &MF = DAG.getMachineFunction();
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
SDValue Result = DAG.getMCSymbol(LSDASym, VT);
return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
}
case Intrinsic::x86_seh_recoverfp: {
SDValue FnOp = Op.getOperand(1);
SDValue IncomingFPOp = Op.getOperand(2);
GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.x86.seh.recoverfp must take a function as the first argument");
return recoverFramePointer(DAG, Fn, IncomingFPOp);
}
case Intrinsic::localaddress: {
// Returns one of the stack, base, or frame pointer registers, depending on
// which is used to reference local variables.
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
Reg = RegInfo->getBaseRegister();
else // This function handles the SP or FP case.
Reg = RegInfo->getPtrSizedFrameRegister(MF);
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
}
}
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
EVT MaskVT = Mask.getValueType();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let ExecutionDepsFix deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
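// Res produces three results: the gathered vector (0), the updated mask (1)
// and the chain (2); only the vector and the chain are returned.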
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let ExecutionDepsFix deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
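// The node's results are the mask write-back (0) and the chain (1); only the
// chain is returned to the caller.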
return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Mask, SDValue Base, SDValue Index,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsic that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
// The ECX register is used to select the index of the XCR register to
// return.
SDValue Chain =
DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
Chain = SDValue(N1, 0);
// Reads the content of XCR and returns it in registers EDX:EAX.
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
Chain = HI.getValue(1);
if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
// The ECX register is used to select the index of the performance counter
// to read.
SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
N->getOperand(2));
SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
// Reads the content of a 64-bit performance counter and returns it in the
// registers EDX:EAX.
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
Chain = HI.getValue(1);
if (Subtarget.is64Bit()) {
// The EAX register is loaded with the low-order 32 bits. The EDX register
// is loaded with the supported high-order bits of the counter.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SmallVectorImpl<SDValue> &Results) {
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
if (Subtarget.is64Bit()) {
LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
SDValue Chain = HI.getValue(1);
if (Opcode == X86ISD::RDTSCP_DAG) {
assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
// The RDTSCP instruction loads the IA32_TSC_AUX MSR (address C000_0103H) into
// the ECX register. Add 'ecx' explicitly to the chain.
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
HI.getValue(2));
// Explicitly store the content of ECX at the location passed in input
// to the 'rdtscp' intrinsic.
Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
MachinePointerInfo());
}
if (Subtarget.is64Bit()) {
// The EDX register is loaded with the high-order 32 bits of the MSR, and
// the EAX register is loaded with the low-order 32 bits.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
SDValue Ops[] = { LO, HI };
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue RegNode = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EH registrations only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue EHGuard = Op.getOperand(2);
WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
report_fatal_error("EHGuard only live in functions using WinEH");
// Cast the operand to an alloca, and remember the frame index.
auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
return SignedSat ?
DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Val };
return SignedSat ?
DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
case llvm::Intrinsic::x86_flags_read_u32:
case llvm::Intrinsic::x86_flags_read_u64:
case llvm::Intrinsic::x86_flags_write_u32:
case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
// Don't do anything here, we will expand these intrinsics out later
// during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
SDValue LwpIns =
DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
LwpIns.getValue(1));
}
}
return SDValue();
}
SDLoc dl(Op);
switch(IntrData->Type) {
default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, cast to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
DAG.getConstant(X86::COND_B, dl, MVT::i32),
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
// gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
// scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC: {
SmallVector<SDValue, 2> Results;
getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// Get Extended Control Register.
case XGETBV: {
SmallVector<SDValue, 2> Results;
getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
Ret, SDValue(InTrans.getNode(), 1));
}
// ADC/ADCX/SBB
case ADX: {
- SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
- SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+ SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
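// Recreate the carry flag: adding -1 (0xFF) to the i8 carry-in operand
// produces a carry out iff the carry-in was non-zero.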
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
DAG.getConstant(-1, dl, MVT::i8));
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
Op.getOperand(4), GenCF.getValue(1));
SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
Op.getOperand(5), MachinePointerInfo());
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
SDValue Results[] = { SetCC, Store };
return DAG.getMergeValues(Results, dl);
}
case COMPRESS_TO_MEM: {
SDValue Mask = Op.getOperand(4);
SDValue DataToCompress = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MVT VT = DataToCompress.getSimpleValueType();
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
if (isAllOnesConstant(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
MemIntr->getMemOperand(),
false /* truncating */, true /* compressing */);
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
case TRUNCATE_TO_MEM_VI32: {
SDValue Mask = Op.getOperand(4);
SDValue DataToTruncate = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
case X86ISD::VTRUNC: {
if (isAllOnesConstant(Mask)) // return just a truncate store
return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
MemIntr->getMemOperand(), true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
if (isAllOnesConstant(Mask))
return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
MemIntr->getMemOperand(), DAG);
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
case EXPAND_FROM_MEM: {
SDValue Mask = Op.getOperand(4);
SDValue PassThru = Op.getOperand(3);
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
MVT VT = Op.getSimpleValueType();
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
if (X86::isZeroNode(Mask))
return DAG.getUNDEF(VT);
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
true /* expanding */);
}
}
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
MachinePointerInfo());
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
EVT VT = Op.getValueType();
MFI.setFrameAddressIsTaken(true);
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
int FrameAddrIndex = FuncInfo->getFAIndex();
if (!FrameAddrIndex) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
FuncInfo->setFAIndex(FrameAddrIndex);
}
return DAG.getFrameIndex(FrameAddrIndex, VT);
}
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const MachineFunction &MF = DAG.getMachineFunction();
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
.Case("rbp", X86::RBP)
.Default(0);
if (Reg == X86::EBP || Reg == X86::RBP) {
if (!TFI.hasFP(MF))
report_fatal_error("register " + StringRef(RegName) +
" is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
#endif
}
if (Reg)
return Reg;
report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
bool X86TargetLowering::needsFixedCatchObjects() const {
return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
SDLoc dl (Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64-bit, we may need the global base reg
// after isel pseudo expansion, i.e., after the CGBR pass has run.
// Therefore, ask for the GlobalBaseReg now, so that the pass
// inserts the code for us in case we need it.
// Otherwise, we will end up in a situation where we will
// reference a virtual register that is not defined!
if (!Subtarget.is64Bit()) {
const X86InstrInfo *TII = Subtarget.getInstrInfo();
(void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
}
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
DAG.getVTList(MVT::i32, MVT::Other),
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
}
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
if (Subtarget.is64Bit()) {
SDValue OutChains[6];
// Large code-model.
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
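// Taken together, the stores below emit a 23-byte trampoline:
//   movabsq $<nested function>, %r11
//   movabsq $<nest value>,      %r10
//   jmpq    *%r11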
// Load the pointer to the nested function into R11.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
/* Alignment = */ 2);
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(10, dl, MVT::i64));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
/* Alignment = */ 2);
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(20, dl, MVT::i64));
OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20));
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(22, dl, MVT::i64));
OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 22));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention");
case CallingConv::C:
case CallingConv::X86_StdCall: {
// Pass 'nest' parameter in ECX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
auto &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
if (InRegCount > 2) {
report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
break;
}
SDValue OutChains[4];
SDValue Addr, Disp;
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
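// Taken together, the stores below emit a 10-byte trampoline:
//   movl $<nest value>, %<nestreg>   (opcode 0xB8 + reg, imm32)
//   jmp  <nested function>           (0xE9 with the rel32 held in Disp)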
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
/* Alignment = */ 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
Addr, MachinePointerInfo(TrmpAddr, 5),
/* Alignment = */ 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
/* Alignment = */ 1);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SelectionDAG &DAG) const {
/*
The rounding mode is in bits 11:10 of FPSR, and has the following
settings:
00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0
FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf
To perform the conversion, we do:
(((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
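For example, if FPSR bits 11:10 are 01 (round to -inf), then
(FPSR & 0x800) >> 11 = 0 and (FPSR & 0x400) >> 9 = 2, so
((0 | 2) + 1) & 3 = 3, the FLT_ROUNDS encoding for round to -inf.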
*/
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
MachineMemOperand::MOStore, 2, 2);
SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
DAG.getVTList(MVT::Other),
Ops, MVT::i16, MMO);
// Load FP Control Word from stack slot
SDValue CWD =
DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x800, DL, MVT::i16)),
DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x400, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,
DAG.getNode(ISD::ADD, DL, MVT::i16,
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
DAG.getConstant(1, DL, MVT::i16)),
DAG.getConstant(3, DL, MVT::i16));
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into two half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
unsigned SizeInBits = VT.getSizeInBits();
// Extract the Lo/Hi vectors
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is512BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 512-bit vector integer operation");
return LowerVectorIntUnary(Op, DAG);
}
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
if (16 < NumElems)
return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
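// The zero extension added 32 - EltSizeInBits leading zeros (24 for i8,
// 16 for i16), so subtract that delta from the dword CTLZ result.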
SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
int NumElts = VT.getVectorNumElements();
int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
// Per-nibble leading zero PSHUFB lookup table.
const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
/* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
/* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumBytes; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
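// For example, for the byte 0x1C the hi nibble is 1 (LUT -> 3) and non-zero,
// so the lo result is masked away and ctlz = 3; for 0x05 the hi nibble is 0
// (LUT -> 4), so the lo result LUT[5] = 1 is added, giving ctlz = 5.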
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
// Merge result back from vXi8 back to VT, working on the lo/hi halves
// of the current vector width in the same way we did for the nibbles.
// If the upper half of the input element is zero then add the halves'
// leading zero counts together, otherwise just use the upper half's.
// Double the width of the result until we are at target width.
while (CurrVT != VT) {
int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
int CurrNumElts = CurrVT.getVectorNumElements();
MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
// NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}
return Res;
}
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (Subtarget.hasCDI())
return LowerVectorCTLZ_AVX512CDI(Op, DAG);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
// Finally xor with NumBits-1.
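// BSR returns the bit index of the most-significant set bit, so
// CTLZ = (NumBits - 1) - BSR(x); because NumBits - 1 is all ones in this
// width, the subtraction is equivalent to an XOR.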
Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
unsigned NumBits = VT.getScalarSizeInBits();
SDLoc dl(Op);
if (VT.isVector()) {
SDValue N0 = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, VT);
// lsb(x) = (x & -x)
SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
// cttz_undef(x) = (width - 1) - ctlz(lsb)
if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
DAG.getNode(ISD::CTLZ, dl, VT, LSB));
}
// cttz(x) = ctpop(lsb - 1)
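// e.g. x = 0b1100: lsb = 0b0100, lsb - 1 = 0b0011, ctpop = 2 = cttz(x).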
SDValue One = DAG.getConstant(1, dl, VT);
return DAG.getNode(ISD::CTPOP, dl, VT,
DAG.getNode(ISD::SUB, dl, VT, LSB, One));
}
assert(Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");
// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits, dl, VT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is512BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
MVT EltVT = VT.getVectorElementType();
MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntUnary(Op, DAG);
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
// vector pairs, multiply and truncate.
if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
if (Subtarget.hasInt256()) {
// For 512-bit vectors, split into 256-bit vectors to allow the
// sign-extension to occur.
if (VT == MVT::v64i8)
return Lower512IntArith(Op, DAG);
// For 256-bit vectors, split into 128-bit vectors to allow the
// sign-extension to occur. We don't need this on AVX512BW as we can
// safely sign-extend to v32i16.
if (VT == MVT::v32i8 && !Subtarget.hasBWI())
return Lower256IntArith(Op, DAG);
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
return DAG.getNode(
ISD::TRUNCATE, dl, VT,
DAG.getNode(ISD::MUL, dl, ExVT,
DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
}
assert(VT == MVT::v16i8 &&
"Pre-AVX2 support only supports v16i8 multiplication");
MVT ExVT = MVT::v8i16;
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
ALo = DAG.getBitcast(ExVT, ALo);
BLo = DAG.getBitcast(ExVT, BLo);
ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
}
// Extract the hi parts and sign extend to i16
SDValue AHi, BHi;
if (Subtarget.hasSSE41()) {
const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
AHi = DAG.getBitcast(ExVT, AHi);
BHi = DAG.getBitcast(ExVT, BHi);
AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
}
// Multiply, mask the lower 8 bits of the lo/hi results and pack
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmuldq is available!");
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
// Multiply the even parts.
SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
Evens = DAG.getBitcast(VT, Evens);
Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
static const int ShufMask[] = { 0, 4, 2, 6 };
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
"Only know how to lower V2I64/V4I64/V8I64 multiply");
// 32-bit vector types used for PMULDQ/PMULUDQ.
MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
// PMULDQ returns the 64-bit result of the signed multiplication of the lower
// 32 bits of each lane. We can lower with it if the sign bits stretch that far.
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
DAG.ComputeNumSignBits(B) > 32) {
return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
DAG.getBitcast(MulVT, B));
}
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
// AloBlo = pmuludq(a, b);
// AloBhi = pmuludq(a, Bhi);
// AhiBlo = pmuludq(Ahi, b);
//
// Hi = psllqi(AloBhi + AhiBlo, 32);
// return AloBlo + Hi;
APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
// Bit cast to 32-bit vectors for MULUDQ.
SDValue Alo = DAG.getBitcast(MulVT, A);
SDValue Blo = DAG.getBitcast(MulVT, B);
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
// Only multiply lo/hi halves that aren't known to be zero.
SDValue AloBlo = Zero;
if (!ALoIsZero && !BLoIsZero)
AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
SDValue AloBhi = Zero;
if (!ALoIsZero && !BHiIsZero) {
SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
Bhi = DAG.getBitcast(MulVT, Bhi);
AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
}
SDValue AhiBlo = Zero;
if (!AHiIsZero && !BLoIsZero) {
SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
Ahi = DAG.getBitcast(MulVT, Ahi);
AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
}
SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntArith(Op, DAG);
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
// logical shift down the upper half and pack back to i8.
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
// With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned Opcode = Op.getOpcode();
unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
// AVX2 implementations - extend xmm subvectors to ymm.
if (Subtarget.hasInt256()) {
SDValue Lo = DAG.getIntPtrConstant(0, dl);
SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
if (VT == MVT::v32i8) {
SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
DAG.getConstant(8, dl, MVT::v16i16));
Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
DAG.getConstant(8, dl, MVT::v16i16));
// The ymm variant of PACKUS treats the 128-bit lanes separately, so before
// using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
16, 17, 18, 19, 20, 21, 22, 23};
const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
24, 25, 26, 27, 28, 29, 30, 31};
return DAG.getNode(X86ISD::PACKUS, dl, VT,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
DAG.getConstant(8, dl, MVT::v16i16));
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
assert(VT == MVT::v16i8 &&
"Pre-AVX2 support only supports v16i8 multiplication");
MVT ExVT = MVT::v8i16;
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
ALo = DAG.getBitcast(ExVT, ALo);
BLo = DAG.getBitcast(ExVT, BLo);
ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
}
// Extract the hi parts and zero/sign extend to i16.
SDValue AHi, BHi;
if (Subtarget.hasSSE41()) {
const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
AHi = DAG.getBitcast(ExVT, AHi);
BHi = DAG.getBitcast(ExVT, BHi);
AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
}
// Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
// pack back to v16i8.
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
"Unexpected return type for lowering");
RTLIB::Libcall LC;
bool isSigned;
switch (Op->getOpcode()) {
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}
SDLoc dl(Op);
SDValue InChain = DAG.getEntryNode();
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
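// Win64 has no native i128 arithmetic, so this becomes a libcall: each i128
// operand is spilled to a 16-byte-aligned stack temporary and passed by
// pointer, and the result is produced as a v2i64 and bitcast back to i128.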
for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
EVT ArgVT = Op->getOperand(i).getValueType();
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(
getLibcallCallingConv(LC),
static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args))
.setInRegister()
.setSExtResult(isSigned)
.setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
}
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
unsigned Opcode = Op.getOpcode();
unsigned NumElems = VT.getVectorNumElements();
MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
};
return DAG.getMergeValues(Ops, dl);
}
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
(VT == MVT::v8i32 && Subtarget.hasInt256()));
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widened result.
// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
//
// In other words, to have all the results, we need to perform two PMULxD:
// 1. one with the even values.
// 2. one with the odd values.
// To achieve #2, we need to place the odd values at an even position.
//
// Place the odd values at an even position (basically, shift all values one
// step to the left):
const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
makeArrayRef(&Mask[0], VT.getVectorNumElements()));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
// Shuffle it back into the right order.
SDValue Highs, Lows;
if (VT == MVT::v8i32) {
const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
} else {
const int HighMask[] = {1, 5, 3, 7};
Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
const int LowMask[] = {0, 4, 2, 6};
Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
}
// If we have a signed multiply but no PMULDQ, fix up the high parts of an
// unsigned multiply.
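// This uses the identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0);
// the arithmetic-shift-and-AND terms below compute those two corrections.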
if (IsSigned && !Subtarget.hasSSE41()) {
SDValue ShAmt = DAG.getConstant(
31, dl,
DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}
// The first result of MUL_LOHI is actually the low value, followed by the
// high value.
SDValue Ops[] = {Lows, Highs};
return DAG.getMergeValues(Ops, dl);
}
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (VT.getScalarSizeInBits() < 16)
return false;
if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256());
bool AShift = LShift && (Subtarget.hasAVX512() ||
(VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;
// vXi16 supported only on AVX-512, BWI
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
// ashr(R, 63) === cmp_slt(R, 0)
if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
"Unsupported PCMPGT op");
return DAG.getNode(X86ISD::PCMPGT, dl, VT,
getZeroVector(VT, Subtarget, DAG, dl), R);
}
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt - 32, DAG);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{9, 1, 11, 3, 13, 5, 15, 7});
} else {
// SRA upper i32, SHL whole i64 and select lower i32.
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
Lower = DAG.getBitcast(ExVT, Lower);
if (VT == MVT::v2i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
if (VT == MVT::v4i64)
Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
{8, 1, 10, 3, 12, 5, 14, 7});
}
return DAG.getBitcast(VT, Ex);
};
// Optimize shl/srl/sra with constant shift amount.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
uint64_t ShiftAmt = ShiftConst->getZExtValue();
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 ||
(Subtarget.hasInt256() && VT == MVT::v32i8) ||
VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
// Simple i8 add case
if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
return DAG.getNode(ISD::ADD, dl, VT, R, R);
// ashr(R, 7) === cmp_slt(R, 0)
if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
if (VT.is512BitVector()) {
assert(VT == MVT::v64i8 && "Unexpected element type!");
SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
}
return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
R, ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
R, ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
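// Worked example (illustrative only): for ShiftAmt == 2 and an input byte
// R == 0xF0 (-16), lshr gives 0x3C, Mask == 128 >> 2 == 0x20, the xor yields
// 0x1C and the final sub produces 0xFC (-4), matching ashr(-16, 2).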
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
llvm_unreachable("Unknown shift opcode.");
}
}
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
// TODO: Replace constant extraction with getTargetConstantBitsFromNode.
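// For illustration: on this little-endian target a uniform v2i64 shift by 3
// is typically legalized to a bitcast of (v4i32 <3, 0, 3, 0>); the loop below
// rebuilds the 64-bit amount from the Ratio i32 pieces (low piece first), and
// the follow-up check verifies the remaining lanes encode the same value.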
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
// AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
unsigned SubVectorScale = 1;
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
SubVectorScale =
Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
Amt = Amt.getOperand(0);
}
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
if (SVN->isSplat()) {
SplatIndex = SVN->getSplatIndex();
Amt = Amt.getOperand(0);
assert(SplatIndex < (int)VT.getVectorNumElements() &&
"Splat shuffle referencing second operand");
}
if (Amt.getOpcode() != ISD::BITCAST ||
Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
(SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
for (unsigned i = 0; i != Ratio; ++i) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
if (!C)
return SDValue();
// 6 == Log2(64)
ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
}
// Check remaining shift amounts (if not a splat).
if (SplatIndex < 0) {
for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
uint64_t ShAmt = 0;
for (unsigned j = 0; j != Ratio; ++j) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
if (!C)
return SDValue();
// 6 == Log2(64)
ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
}
if (ShAmt != ShiftAmt)
return SDValue();
}
}
if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
if (Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
}
return SDValue();
}
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
SDValue BaseShAmt;
MVT EltVT = VT.getVectorElementType();
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
// Check if this build_vector node is doing a splat.
// If so, then set BaseShAmt equal to the splat value.
BaseShAmt = BV->getSplatValue();
if (BaseShAmt && BaseShAmt.isUndef())
BaseShAmt = SDValue();
} else {
if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
Amt = Amt.getOperand(0);
ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
if (SVN && SVN->isSplat()) {
unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
SDValue InVec = Amt.getOperand(0);
if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
"Unexpected shuffle index found!");
BaseShAmt = InVec.getOperand(SplatIdx);
} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
if (ConstantSDNode *C =
dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
if (C->getZExtValue() == SplatIdx)
BaseShAmt = InVec.getOperand(1);
}
}
if (!BaseShAmt)
// Avoid introducing an extract element from a shuffle.
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
DAG.getIntPtrConstant(SplatIdx, dl));
}
}
if (BaseShAmt.getNode()) {
assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
}
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
VT.getVectorNumElements();
std::vector<SDValue> Vals(Ratio);
for (unsigned i = 0; i != Ratio; ++i)
Vals[i] = Amt.getOperand(i);
for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
for (unsigned j = 0; j != Ratio; ++j)
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
return SDValue();
}
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
return Op;
// XOP has 128-bit variable logical/arithmetic shifts.
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() &&
(VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Op.getOpcode() == ISD::SRA)
return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
}
// v2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
// Splat the shift amounts so the scalar shifts above will catch it.
SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// i64 vector arithmetic shift can be emulated with the transform:
// M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Op.getOpcode() == ISD::SRA) {
SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// Do this only if the vector shift count is a constant build_vector.
if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16))) {
SmallVector<SDValue, 8> Elts;
MVT SVT = VT.getVectorElementType();
unsigned SVTBits = SVT.getSizeInBits();
APInt One(SVTBits, 1);
unsigned NumElems = VT.getVectorNumElements();
for (unsigned i=0; i !=NumElems; ++i) {
SDValue Op = Amt->getOperand(i);
if (Op->isUndef()) {
Elts.push_back(Op);
continue;
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
SDValue BV = DAG.getBuildVector(VT, dl, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
}
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
Op = DAG.getNode(ISD::ADD, dl, VT, Op,
DAG.getConstant(0x3f800000U, dl, VT));
Op = DAG.getBitcast(MVT::v4f32, Op);
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
// If possible, lower this shift as a sequence of two shifts by
// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
// Example:
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
//
// Could be rewritten as:
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
//
// The advantage is that the two shifts from the example would be
// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
// the vector shift into four scalar shifts plus four pairs of vector
// insert/extract.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
unsigned TargetOpcode = X86ISD::MOVSS;
bool CanBeSimplified;
// The splat value for the first packed shift (the 'X' from the example).
SDValue Amt1 = Amt->getOperand(0);
// The splat value for the second packed shift (the 'Y' from the example).
SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
// See if it is possible to replace this node with a sequence of
// two shifts followed by a MOVSS/MOVSD/PBLEND.
if (VT == MVT::v4i32) {
// Check if it is legal to use a MOVSS.
CanBeSimplified = Amt2 == Amt->getOperand(2) &&
Amt2 == Amt->getOperand(3);
if (!CanBeSimplified) {
// Otherwise, check if we can still simplify this node using a MOVSD.
CanBeSimplified = Amt1 == Amt->getOperand(1) &&
Amt->getOperand(2) == Amt->getOperand(3);
TargetOpcode = X86ISD::MOVSD;
Amt2 = Amt->getOperand(2);
}
} else {
// Do similar checks for the case where the machine value type
// is MVT::v8i16.
CanBeSimplified = Amt1 == Amt->getOperand(1);
for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
CanBeSimplified = Amt2 == Amt->getOperand(i);
if (!CanBeSimplified) {
TargetOpcode = X86ISD::MOVSD;
CanBeSimplified = true;
Amt2 = Amt->getOperand(4);
for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
CanBeSimplified = Amt1 == Amt->getOperand(i);
for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
CanBeSimplified = Amt2 == Amt->getOperand(j);
}
}
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
MVT CastVT = MVT::v4i32;
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
if (TargetOpcode == X86ISD::MOVSD)
return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
BitCast2, {0, 1, 6, 7}));
return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
BitCast2, {0, 5, 6, 7}));
}
}
// v4i32 Non-Uniform Shifts.
// If the shift amount is constant we can shift each lane using the SSE2
// immediate shifts, else we need to zero-extend each lane to the lower i64
// and shift using the SSE2 variable shifts.
// The separate results can then be blended together.
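// For example, with amounts <1, 2, 3, 4> we build four amount vectors, do four
// per-lane shifts R0..R3, and the shuffles below keep lane 0 of R0, lane 1 of
// R1, lane 2 of R2 and lane 3 of R3 in the blended result.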
if (VT == MVT::v4i32) {
unsigned Opc = Op.getOpcode();
SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
// ISD::SHL is handled above but we include it here for completeness.
switch (Opc) {
default:
llvm_unreachable("Unknown target vector shift node");
case ISD::SHL:
Opc = X86ISD::VSHL;
break;
case ISD::SRL:
Opc = X86ISD::VSRL;
break;
case ISD::SRA:
Opc = X86ISD::VSRA;
break;
}
// The SSE2 shifts use the lower i64 as the same shift amount for
// all lanes and the upper i64 is ignored. These shuffle masks
// optimally zero-extend each lane on SSE2/SSE41/AVX targets.
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
}
SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
// It's worth extending once and using the vXi16/vXi32 shifts for smaller
// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
// make the existing SSE solution better.
if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
(Subtarget.hasAVX512() && VT == MVT::v16i16) ||
(Subtarget.hasAVX512() && VT == MVT::v16i8) ||
(Subtarget.hasBWI() && VT == MVT::v32i8)) {
MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
R = DAG.getNode(ExtOpc, dl, ExtVT, R);
Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (VT.is512BitVector()) {
// On AVX512BW targets we make use of the fact that VSELECT lowers
// to a masked blend which selects bytes based just on the sign bit
// extracted to a mask.
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
// We can safely do this using i16 shifts as we're only interested in
// the 3 lower bits of each byte.
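// For example, a shift amount of 5 (0b101) has bit 2 (worth 4) moved into the
// byte sign bit by the shift left of 5, so the first blend applies the
// shift-by-4 step; after each 'a += a' the next lower amount bit reaches the
// sign bit, selecting the shift-by-2 and shift-by-1 steps in turn.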
Amt = DAG.getBitcast(ExtVT, Amt);
Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
Amt = DAG.getBitcast(VT, Amt);
if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
// r = VSELECT(r, shift(r, 4), a);
SDValue M =
DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(VT, Amt, M, R);
return R;
}
if (Op->getOpcode() == ISD::SRA) {
// For SRA we need to unpack each byte to the higher byte of a i16 vector
// so we can correctly sign extend. We don't care what happens to the
// lower byte.
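// Placing a byte X in the high half of an i16 lane puts X's sign bit into the
// lane's sign bit (the low byte is don't-care), so the i16 arithmetic shifts
// below sign extend correctly; the shifted byte is recovered at the end with a
// logical shift right by 8 followed by PACKUS.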
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
// r = VSELECT(r, shift(r, 4), a);
SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
DAG.getConstant(4, dl, ExtVT));
SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
DAG.getConstant(4, dl, ExtVT));
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 2), a);
MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
DAG.getConstant(2, dl, ExtVT));
MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
DAG.getConstant(2, dl, ExtVT));
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// a += a
ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
// r = VSELECT(r, shift(r, 1), a);
MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
DAG.getConstant(1, dl, ExtVT));
MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
DAG.getConstant(1, dl, ExtVT));
RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Logical shift the result back to the lower byte, leaving a zero upper
// byte, meaning that we can safely pack with PACKUSWB.
RLo =
DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
RHi =
DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
ALo = DAG.getBitcast(ExtVT, ALo);
AHi = DAG.getBitcast(ExtVT, AHi);
RLo = DAG.getBitcast(ExtVT, RLo);
RHi = DAG.getBitcast(ExtVT, RHi);
SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
if (VT == MVT::v8i16) {
unsigned ShiftOpcode = Op->getOpcode();
// If we have a constant shift amount, the non-SSE41 path is best as
// avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
bool UseSSE41 = Subtarget.hasSSE41() &&
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
if (UseSSE41) {
// On SSE41 targets we need to replicate the shift mask in both
// bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
} else {
Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
}
// r = VSELECT(r, shift(r, 8), a);
SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 4), a);
M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// r = VSELECT(r, shift(r, 2), a);
M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
R = SignBitSelect(Amt, M, R);
// a += a
Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
// return VSELECT(r, shift(r, 1), a);
M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
R = SignBitSelect(Amt, M, R);
return R;
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector())
return Lower256IntArith(Op, DAG);
return SDValue();
}
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (Subtarget.hasAVX512()) {
// Attempt to rotate by immediate.
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
return EltBits[0] == V;
})) {
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
}
// Else, fall back on VPROLV/VPRORV.
return Op;
}
assert(VT.isVector() && "Custom lowering only for vector rotates!");
assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right.
// Split 256-bit integers.
if (VT.is256BitVector())
return Lower256IntArith(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
assert(RotateAmt < EltSizeInBits && "Rotation out of range");
return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
}
// Use general rotate by variable (per-element).
return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
// has only one use.
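// For example, (i32, i1) = ISD::SADDO a, b becomes Sum = X86ISD::ADD a, b
// (which also produces EFLAGS) plus a SETCC on X86::COND_O, while the
// unsigned variants test X86::COND_B (carry) instead.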
SDNode *N = Op.getNode();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
unsigned BaseOp = 0;
X86::CondCode Cond;
SDLoc DL(Op);
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
// An add of one will be selected as an INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
if (isOneConstant(RHS)) {
BaseOp = X86ISD::INC;
Cond = X86::COND_O;
break;
}
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
case ISD::UADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_B;
break;
case ISD::SSUBO:
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
if (isOneConstant(RHS)) {
BaseOp = X86ISD::DEC;
Cond = X86::COND_O;
break;
}
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
case ISD::USUBO:
BaseOp = X86ISD::SUB;
Cond = X86::COND_B;
break;
case ISD::SMULO:
BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
if (N->getValueType(0) == MVT::i8) {
BaseOp = X86ISD::UMUL8;
Cond = X86::COND_O;
break;
}
SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
MVT::i32);
SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
// Also sets EFLAGS.
SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
return Subtarget.hasCmpxchg16b();
else
return false;
}
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return needsCmpXchgNb(SI->getValueOperand()->getType());
}
// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
auto PTy = cast<PointerType>(LI->getPointerOperandType());
return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
default:
llvm_unreachable("Unknown atomic operation");
case AtomicRMWInst::Xchg:
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
return AtomicExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
return AtomicExpansionKind::CmpXChg;
}
}
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// harmful as it introduces an mfence.
if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
auto Ptr = AI->getPointerOperand();
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required:
// Thread 0:
// x.store(1, relaxed);
// r1 = y.fetch_add(0, release);
// Thread 1:
// y.fetch_add(42, acquire);
// r2 = x.load(relaxed);
// r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
// lowered to just a load without a fence. An mfence flushes the store buffer,
// making the optimization clearly correct.
// FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
if (!Subtarget.hasMFence())
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
Function *MFence =
llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
AI->getType()->getPrimitiveSizeInBits());
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
SDValue Ops[] = {
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, dl, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
DAG.getTargetConstant(0, dl, MVT::i32), // Disp
DAG.getRegister(0, MVT::i32), // Segment.
Zero,
Chain
};
SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
return SDValue(Res, 0);
}
// MEMBARRIER is a compiler barrier; it codegens to a no-op.
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
switch(T.SimpleTy) {
default: llvm_unreachable("Invalid value type!");
case MVT::i8: Reg = X86::AL; size = 1; break;
case MVT::i16: Reg = X86::AX; size = 2; break;
case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
Reg = X86::RAX; size = 8;
break;
}
SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
Op.getOperand(2), SDValue());
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
SDValue cpOut =
DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
return SDValue();
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
MVT DstVT = Op.getSimpleValueType();
if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
SrcVT == MVT::i64) {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (DstVT != MVT::f64)
// This conversion needs to be expanded.
return SDValue();
SDValue Op0 = Op->getOperand(0);
SmallVector<SDValue, 16> Elts;
SDLoc dl(Op);
unsigned NumElts;
MVT SVT;
if (SrcVT.isVector()) {
NumElts = SrcVT.getVectorNumElements();
SVT = SrcVT.getVectorElementType();
// Widen the input vector in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
DAG.getIntPtrConstant(i, dl)));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
DAG.getIntPtrConstant(0, dl)));
Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
DAG.getIntPtrConstant(1, dl)));
NumElts = 2;
SVT = MVT::i32;
}
// Explicitly mark the extra elements as Undef.
Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
DAG.getIntPtrConstant(0, dl));
}
assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
Subtarget.hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
"Unexpected custom BITCAST");
// i64 <=> MMX conversions are Legal.
if (SrcVT==MVT::i64 && DstVT.isVector())
return Op;
if (DstVT==MVT::i64 && SrcVT.isVector())
return Op;
// MMX <=> MMX conversions are Legal.
if (SrcVT.isVector() && DstVT.isVector())
return Op;
// All other conversions need to be expanded.
return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(V);
MVT ByteVecVT = V.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
"Expected value to have byte element type.");
assert(EltVT != MVT::i8 &&
"Horizontal byte sum only makes sense for wider elements!");
unsigned VecSize = VT.getSizeInBits();
assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
// The PSADBW instruction horizontally adds all bytes and leaves the result in
// i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
if (EltVT == MVT::i64) {
SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
return DAG.getBitcast(VT, V);
}
if (EltVT == MVT::i32) {
// We unpack the low half and high half into i32s interleaved with zeros so
// that we can use PSADBW to horizontally sum them. The most useful part of
// this is that it lines up the results of two PSADBW instructions to be
// two v2i64 vectors which concatenated are the 4 population counts. We can
// then use PACKUSWB to shrink and concatenate them into a v4i32 again.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
SDValue V32 = DAG.getBitcast(VT, V);
SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
// Do the horizontal sums into two v2i64s.
Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, Low), Zeros);
High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
DAG.getBitcast(ByteVecVT, High), Zeros);
// Merge them together.
MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
DAG.getBitcast(ShortVecVT, Low),
DAG.getBitcast(ShortVecVT, High));
return DAG.getBitcast(VT, V);
}
// The only element type left is i16.
assert(EltVT == MVT::i16 && "Unknown how to handle type");
// To obtain pop count for each i16 element starting from the pop count for
// i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
// right by 8. It is important to shift as i16s as i8 vector shift isn't
// directly supported.
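// Worked example for one i16 lane holding the byte counts [hi=2, lo=3]
// (0x0203): shl by 8 gives 0x0300, the byte-wise add yields 0x0503, and the
// final srl by 8 leaves 0x0005 == 2 + 3.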
SDValue ShifterV = DAG.getConstant(8, DL, VT);
SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
DAG.getBitcast(ByteVecVT, V));
return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned VecSize = VT.getSizeInBits();
// Implement a lookup table in register by using an algorithm based on:
// http://wm.ite.pl/articles/sse-popcount.html
//
// The general idea is that every lower byte nibble in the input vector is an
// index into an in-register pre-computed pop count table. We then split up the
// input vector into two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
// masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is an
// i8 vector where each element contains the pop count for its input byte.
//
// To obtain the pop count for elements != i8, we follow up with the same
// approach and use additional tricks as described below.
//
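// For example, the input byte 0xB7 (0b10110111) has high nibble 0xB and low
// nibble 0x7; LUT[0xB] == 3 and LUT[0x7] == 3, and their sum 6 is the pop
// count of 0xB7.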
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
int NumByteElts = VecSize / 8;
MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
SDValue In = DAG.getBitcast(ByteVecVT, Op);
SmallVector<SDValue, 64> LUTVec;
for (int i = 0; i < NumByteElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// High nibbles
SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
// Low nibbles
SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
// The nibble vectors are used as the shuffle masks that index elements into
// the LUT. After counting low and high nibbles, add the two results to obtain
// the final pop count per i8 element.
SDValue HighPopCnt =
DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
SDValue LowPopCnt =
DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
if (EltVT == MVT::i8)
return PopCnt;
return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert(VT.is128BitVector() &&
"Only 128-bit vector bitmath lowering supported.");
int VecSize = VT.getSizeInBits();
MVT EltVT = VT.getVectorElementType();
int Len = EltVT.getSizeInBits();
// This is the vectorized version of the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// with a minor tweak to use a series of adds + shifts instead of vector
// multiplications. Implemented for all integer vector types. We only use
// this when we don't have SSSE3 which allows a LUT-based lowering that is
// much faster, even faster than using native popcnt instructions.
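// Worked example for the byte 0xB7: step 1 gives 0x66 (pairwise counts),
// step 2 gives 0x33 (per-nibble counts of 3 and 3), and step 3 gives 0x06,
// the pop count of 0xB7.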
auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
MVT VT = V.getSimpleValueType();
SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
return DAG.getNode(OpCode, DL, VT, V, ShifterV);
};
auto GetMask = [&](SDValue V, APInt Mask) {
MVT VT = V.getSimpleValueType();
SDValue MaskV = DAG.getConstant(Mask, DL, VT);
return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
};
// We don't want to incur the implicit masks required to SRL vNi8 vectors on
// x86, so set the SRL type to have elements at least i16 wide. This is
// correct because all of our SRLs are followed immediately by a mask anyway
// that handles any bits that sneak into the high bits of the byte elements.
MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
SDValue V = Op;
// v = v - ((v >> 1) & 0x55555555...)
SDValue Srl =
DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
V = DAG.getNode(ISD::SUB, DL, VT, V, And);
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
// v = (v + (v >> 4)) & 0x0F0F0F0F...
Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
// At this point, V contains the byte-wise population count, and we are
// merely doing a horizontal sum if necessary to get the wider element
// counts.
if (EltVT == MVT::i8)
return V;
return LowerHorizontalByteSum(
DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
DAG);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
"Unknown CTPOP type to handle");
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {
if (VT == MVT::v8i16) {
Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
}
if (VT == MVT::v16i8 || VT == MVT::v16i16) {
Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
}
}
if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Op.getSimpleValueType().isVector() &&
"We only do custom lowering for vector population count.");
return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
// For scalars, it's still beneficial to transfer to/from the SIMD unit to
// perform the BITREVERSE.
if (!VT.isVector()) {
MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
return Lower256IntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
// VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
// perform the BSWAP in the shuffle.
// It's best to shuffle using the second operand as this will implicitly allow
// memory folding for multiple vectors.
SmallVector<SDValue, 16> MaskElts;
for (int i = 0; i != NumElts; ++i) {
for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
int PermuteByte = SourceByte | (2 << 5);
MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
}
}
SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
SDValue Res = DAG.getBitcast(MVT::v16i8, In);
Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
Res, Mask);
return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (Subtarget.hasXOP())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
// 0-15 value (moved to the other nibble).
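// For example, the byte 0x2D (0b00101101) splits into low nibble 0xD and high
// nibble 0x2; LoLUT[0xD] == 0xB0 and HiLUT[0x2] == 0x04, and their OR is 0xB4,
// the bit reversal of 0x2D.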
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
const int LoLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
/* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
/* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
/* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
const int HiLUT[16] = {
/* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
/* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
/* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
/* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
for (unsigned i = 0; i < NumElts; ++i) {
LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
}
SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
unsigned NewOpc = 0;
switch (N->getOpcode()) {
case ISD::ATOMIC_LOAD_ADD:
NewOpc = X86ISD::LADD;
break;
case ISD::ATOMIC_LOAD_SUB:
NewOpc = X86ISD::LSUB;
break;
case ISD::ATOMIC_LOAD_OR:
NewOpc = X86ISD::LOR;
break;
case ISD::ATOMIC_LOAD_XOR:
NewOpc = X86ISD::LXOR;
break;
case ISD::ATOMIC_LOAD_AND:
NewOpc = X86ISD::LAND;
break;
default:
llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
}
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
/*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
unsigned Opc = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
// We can lower atomic_load_add into LXADD. However, any other atomicrmw op
// can only be lowered when the result is unused. They should have already
// been transformed into a cmpxchg loop in AtomicExpand.
if (N->hasAnyUseOfValue(0)) {
// Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
// select LXADD if LOCK_SUB can't be selected.
if (Opc == ISD::ATOMIC_LOAD_SUB) {
AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
RHS, AN->getMemOperand());
}
assert(Opc == ISD::ATOMIC_LOAD_ADD &&
"Used AtomicRMW ops other than Add should have been expanded!");
return N;
}
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
return SDValue();
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
SDLoc dl(Node);
EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
// Convert seq_cst store -> xchg
// Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
// FIXME: On 32-bit, store -> fist or movq would be more efficient
// (The only way to get a 16-byte store is cmpxchg16b)
// FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
if (cast<AtomicSDNode>(Node)->getOrdering() ==
AtomicOrdering::SequentiallyConsistent ||
!DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2),
cast<AtomicSDNode>(Node)->getMemOperand());
return Swap.getValue(1);
}
// Other atomic stores have a simple pattern.
return Op;
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDLoc DL(N);
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getConstant(NegOne, DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
Op.getOperand(1), Carry.getValue(1));
SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
// which returns the values as { float, float } (in XMM0) or
// { double, double } (which is returned in XMM0, XMM1).
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.IsSExt = false;
Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
// Only optimize x86_64 for now. i386 is a bit messy. For f32,
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
: (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
return CallResult.first;
// Returned in bits 0:31 and 32:63 of xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
bool FillWithZeroes = false) {
// Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType();
if (InVT == NVT)
return InOp;
if (InOp.isUndef())
return DAG.getUNDEF(NVT);
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
unsigned InNumElts = InVT.getVectorNumElements();
unsigned WidenNumElts = NVT.getVectorNumElements();
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
SDValue N1 = InOp.getOperand(1);
if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
N1.isUndef()) {
InOp = InOp.getOperand(0);
InVT = InOp.getSimpleValueType();
InNumElts = InVT.getVectorNumElements();
}
}
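// For constant build vectors, widen by appending explicit fill elements so
// the result stays a single BUILD_VECTOR node.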
if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
EVT EltVT = InOp.getOperand(0).getValueType();
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
Ops.push_back(FillVal);
return DAG.getBuildVector(NVT, dl, Ops);
}
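// Otherwise insert the input into the low lanes of a zero- or undef-filled
// vector of the wide type.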
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
DAG.getUNDEF(NVT);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
// An X86 scatter kills the mask register, so its type should be added to
// the list of return values.
// If the "scatter" has 2 return values, it is already handled.
if (Op.getNode()->getNumValues() == 2)
return Op;
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
SDValue NewScatter;
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
MVT MemVT = N->getMemoryVT().getSimpleVT();
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
// The v2i32 value was promoted to v2i64.
// Now we "redo" the type legalizer's work and widen the original
// v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
// with a shuffle.
assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
"Unexpected memory type");
int ShuffleMask[] = {0, 2, -1, -1};
Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
DAG.getUNDEF(MVT::v4i32), ShuffleMask);
// Now we have 4 elements instead of 2.
// Expand the index.
MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
Index = ExtendToType(Index, NewIndexVT, DAG);
// Expand the mask with zeroes
// Mask may be <2 x i64> or <2 x i1> at this moment
assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
"Unexpected mask type");
MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
VT = MVT::v4i32;
}
unsigned NumElts = VT.getVectorNumElements();
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors. Either the data or the index
// must be 512 bits wide. If both the index and the data are 256-bit at
// this point but the vector contains 8 elements, we just sign-extend the index
if (IndexVT == MVT::v8i32)
// Just extend index
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
else {
// The minimal number of elts in scatter is 8
NumElts = 8;
// Index
MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
// Use original index here, do not modify the index twice
Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
if (IndexVT.getScalarType() == MVT::i32)
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
// Mask
// At this point we have promoted mask operand
assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Use the original mask here, do not modify the mask twice
Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
// The value that should be stored
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src = ExtendToType(Src, NewVT, DAG);
}
}
// If the mask is "wide" at this point, truncate it to an i1 vector
MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
// The mask is killed by the scatter, so add it to the returned values
SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
MVT VT = Op.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!");
assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
"Expanding masked load is supported for 32 and 64-bit types only!");
// 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
// VLX. Expanding loads of these types are handled below.
if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
return Op;
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked load op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked load op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
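// The pass-through (merge) value must be widened to the wide data type as
// well.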
SDValue Src0 = N->getSrc0();
Src0 = ExtendToType(Src0, WideDataVT, DAG);
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
"We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
if (MaskEltTy != MVT::i1)
Mask = DAG.getNode(ISD::TRUNCATE, dl,
MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
N->getBasePtr(), Mask, Src0,
N->getMemoryVT(), N->getMemOperand(),
N->getExtensionType(),
N->isExpandingLoad());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
SDValue DataToStore = N->getValue();
MVT VT = DataToStore.getSimpleValueType();
MVT ScalarVT = VT.getScalarType();
SDValue Mask = N->getMask();
SDLoc dl(Op);
assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Compressing masked store is supported on AVX-512 target only!");
assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
"Compressing masked store is supported for 32 and 64-bit types only!");
// 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
return Op;
assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
"Cannot lower masked store op.");
assert((ScalarVT.getSizeInBits() >= 32 ||
(Subtarget.hasBWI() &&
(ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
"Unsupported masked store op.");
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bits.
unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
"We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
if (MaskEltTy != MVT::i1)
Mask = DAG.getNode(ISD::TRUNCATE, dl,
MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
Mask, N->getMemoryVT(), N->getMemOperand(),
N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Src0 = N->getValue();
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors. Either the data or the index
// must be 512 bits wide. If both the index and the data are 256-bit at
// this point but the vector contains 8 elements, we just sign-extend the index
if (NumElts == 8) {
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), Index };
DAG.UpdateNodeOperands(N, Ops);
return Op;
}
// Minimal number of elements in Gather
NumElts = 8;
// Index
MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
Index = ExtendToType(Index, NewIndexVT, DAG);
if (IndexVT.getScalarType() == MVT::i32)
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
// Mask
MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
// At this point we have promoted mask operand
assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
// The pass-through value
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src0 = ExtendToType(Src0, NewVT, DAG);
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewGather.getValue(0),
DAG.getIntPtrConstant(0, dl));
SDValue RetOps[] = {Extract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
// There is a special case when the return type v2i32 is illegal and
// the type legalizer extended it to v2i64. Without this conversion we end up
// with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
// In order to avoid this situation, we'll build an X86 specific Gather node
// with index v2i64 and value type v4i32.
assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
"Unexpected type in masked gather");
Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
DAG.getBitcast(MVT::v4i32, Src0),
DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
// The mask should match the destination type. Extending the mask with
// zeroes is not necessary since the instruction itself reads only two
// values from memory.
Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
NewGather.getValue(0), DAG);
SDValue RetOps[] = { Sext, NewGather.getValue(1) };
return DAG.getMergeValues(RetOps, dl);
}
if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
// This transformation is for optimization only.
// The type legalizer extended the mask and the index to 4-element vectors
// in order to match the requirement of the common gather node - the same
// vector width for index and value. The X86 gather node allows the widths
// to mismatch, so a more optimal instruction can be selected at the end.
assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
"Unexpected type in masked gather");
if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
Index.getOpcode() == ISD::CONCAT_VECTORS &&
Index.getOperand(1).isUndef()) {
Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
Index = Index.getOperand(0);
} else
return Op;
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
N->getMemOperand());
SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
return DAG.getMergeValues(RetOps, dl);
}
return Op;
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
// no-ops in the case of a null GC strategy (or a GC strategy which does not
// require special handling for these nodes), lower them as literal NOOPs for
// the time being.
SmallVector<SDValue, 2> Ops;
Ops.push_back(Op.getOperand(0));
if (Op->getGluedNode())
Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
SDLoc OpDL(Op);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
return NOOP;
}
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::EH_SJLJ_SETUP_DISPATCH:
return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::MULHS:
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
case ISD::ROTL:
case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
case ISD::SUB: return LowerADD_SUB(Op, DAG);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
return LowerGC_TRANSITION_START(Op, DAG);
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
}
}
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDValue Res = LowerOperation(SDValue(N, 0), DAG);
if (!Res.getNode())
return;
assert((N->getNumValues() <= Res->getNumValues()) &&
"Lowering returned the wrong number of results!");
// Place new result values based on the result number of N.
// In some cases (LowerSINT_TO_FP for example) Res has more result values
// than the original node; the chain (the last value) should be dropped.
for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
case X86ISD::AVG: {
// Legalize types for X86ISD::AVG by expanding vectors.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
auto InVT = N->getValueType(0);
auto InVTSize = InVT.getSizeInBits();
const unsigned RegSize =
(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
assert((Subtarget.hasBWI() || RegSize < 512) &&
"512-bit vector requires AVX512BW");
assert((Subtarget.hasAVX2() || RegSize < 256) &&
"256-bit vector requires AVX2");
auto ElemVT = InVT.getVectorElementType();
auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
RegSize / ElemVT.getSizeInBits());
assert(RegSize % InVT.getSizeInBits() == 0);
unsigned NumConcat = RegSize / InVT.getSizeInBits();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
Ops[0] = N->getOperand(0);
SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
Ops[0] = N->getOperand(1);
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
DAG.getIntPtrConstant(0, dl)));
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
case X86ISD::FMAXC:
case X86ISD::FMAX: {
EVT VT = N->getValueType(0);
assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
SDValue UNDEF = DAG.getUNDEF(VT);
SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(0), UNDEF);
SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
N->getOperand(1), UNDEF);
Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
return;
}
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM:
case ISD::SDIVREM:
case ISD::UDIVREM: {
SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
Results.push_back(V);
return;
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
if (N->getValueType(0) == MVT::v2i32) {
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
SDValue Src = N->getOperand(0);
if (Src.getValueType() == MVT::v2f64) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
: X86ISD::CVTTP2UI,
dl, MVT::v4i32, Src);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
if (Src.getValueType() == MVT::v2f32) {
SDValue Idx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
// so early out here.
return;
}
std::pair<SDValue,SDValue> Vals =
FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
SDValue FIST = Vals.first, StackSlot = Vals.second;
if (FIST.getNode()) {
EVT VT = N->getValueType(0);
// Return a load from the stack slot.
if (StackSlot.getNode())
Results.push_back(
DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
else
Results.push_back(FIST);
}
return;
}
case ISD::SINT_TO_FP: {
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
SDValue Src = N->getOperand(0);
if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
return;
Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
return;
}
case ISD::UINT_TO_FP: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
return;
}
if (SrcVT != MVT::v2i32)
return;
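// Standard unsigned-to-double trick: OR the zero-extended value into the
// mantissa of 2^52 (0x4330000000000000), subtract 2^52 to recover the exact
// double value, then round it down to f32 with VFPROUND.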
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
return;
}
case ISD::FP_ROUND: {
if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
return;
SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
return;
}
case ISD::FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
"Do not know how to legalize this Node");
return;
}
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
case Intrinsic::x86_rdtscp:
return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
Results);
case Intrinsic::x86_rdpmc:
return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
case Intrinsic::x86_xgetbv:
return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
}
}
case ISD::INTRINSIC_WO_CHAIN: {
if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
Results.push_back(V);
return;
}
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
}
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
EVT T = N->getValueType(0);
assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
bool Regs64bit = T == MVT::i128;
MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
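// CMPXCHG8B/16B expects the compare value in EDX:EAX (RDX:RAX) and the
// replacement value in ECX:EBX (RCX:RBX); split both double-width operands
// and copy the halves into those registers.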
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
Regs64bit ? X86::RDX : X86::EDX,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
DAG.getConstant(1, dl, HalfT));
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
// If the current function needs the base pointer, RBX,
// we shouldn't use cmpxchg directly.
// Indeed the lowering of that instruction will clobber
// that register, and since RBX will be a reserved register
// the register allocator will not make sure its value is
// properly saved and restored around this live-range.
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
unsigned BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
// ISel prefers the LCMPXCHG64 variant.
// If that assert fires, it means that is no longer the case,
// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
// not just EBX. This is a matter of accepting an i64 input for that
// pseudo, and restoring into the register of the right width
// in the expand pseudo. Everything else should just work.
assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
"Saving only half of the RBX");
unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
HalfT, swapInH.getValue(1));
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
RBXSave,
/*Glue*/ RBXSave.getValue(2)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
} else {
unsigned Opcode =
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
Regs64bit ? X86::RDX : X86::EDX,
HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
Results.push_back(Success);
Results.push_back(EFLAGS.getValue(1));
return;
}
case ISD::ATOMIC_SWAP:
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
}
case ISD::BITCAST: {
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT DstVT = N->getValueType(0);
EVT SrcVT = N->getOperand(0)->getValueType(0);
if (SrcVT != MVT::f64 ||
(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
return;
unsigned NumElts = DstVT.getVectorNumElements();
EVT SVT = DstVT.getVectorElementType();
EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
MVT::v2f64, N->getOperand(0));
SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
if (ExperimentalVectorWideningLegalization) {
// If we are legalizing vectors by widening, we already have the desired
// legal vector type, just return it.
Results.push_back(ToVecInt);
return;
}
SmallVector<SDValue, 8> Elts;
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
ToVecInt, DAG.getIntPtrConstant(i, dl)));
Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
}
}
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
case X86ISD::SHRD: return "X86ISD::SHRD";
case X86ISD::FAND: return "X86ISD::FAND";
case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
case X86ISD::FLD: return "X86ISD::FLD";
case X86ISD::FST: return "X86ISD::FST";
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
case X86ISD::IRET: return "X86ISD::IRET";
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::ADDUS: return "X86ISD::ADDUS";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_SJLJ_SETUP_DISPATCH:
return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
case X86ISD::LADD: return "X86ISD::LADD";
case X86ISD::LSUB: return "X86ISD::LSUB";
case X86ISD::LOR: return "X86ISD::LOR";
case X86ISD::LXOR: return "X86ISD::LXOR";
case X86ISD::LAND: return "X86ISD::LAND";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
case X86ISD::VSEXT: return "X86ISD::VSEXT";
case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
case X86ISD::VSRA: return "X86ISD::VSRA";
case X86ISD::VSHLI: return "X86ISD::VSHLI";
case X86ISD::VSRLI: return "X86ISD::VSRLI";
case X86ISD::VSRAI: return "X86ISD::VSRAI";
case X86ISD::VSRAV: return "X86ISD::VSRAV";
case X86ISD::VROTLI: return "X86ISD::VROTLI";
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
case X86ISD::SBB: return "X86ISD::SBB";
case X86ISD::SMUL: return "X86ISD::SMUL";
case X86ISD::UMUL: return "X86ISD::UMUL";
case X86ISD::SMUL8: return "X86ISD::SMUL8";
case X86ISD::UMUL8: return "X86ISD::UMUL8";
case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
case X86ISD::INC: return "X86ISD::INC";
case X86ISD::DEC: return "X86ISD::DEC";
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
case X86ISD::TESTM: return "X86ISD::TESTM";
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::VALIGN: return "X86ISD::VALIGN";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::MFENCE: return "X86ISD::MFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
case X86ISD::VPROT: return "X86ISD::VPROT";
case X86ISD::VPROTI: return "X86ISD::VPROTI";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECT: return "X86ISD::SELECT";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::EXP2: return "X86ISD::EXP2";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
case X86ISD::SCALEF: return "X86ISD::SCALEF";
case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
case X86ISD::ADDS: return "X86ISD::ADDS";
case X86ISD::SUBS: return "X86ISD::SUBS";
case X86ISD::AVG: return "X86ISD::AVG";
case X86ISD::MULHRS: return "X86ISD::MULHRS";
case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
}
return nullptr;
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
// X86 allows a sign-extended 32-bit immediate field as a displacement.
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
return false;
if (AM.BaseGV) {
unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
// If a reference to this global requires an extra load, we can't fold it.
if (isGlobalStubReference(GVFlags))
return false;
// If BaseGV requires a register for the PIC base, we cannot also have a
// BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
return false;
// If lower 4G is not available, then we must use rip-relative addressing.
if ((M != CodeModel::Small || isPositionIndependent()) &&
Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
return false;
}
switch (AM.Scale) {
case 0:
case 1:
case 2:
case 4:
case 8:
// These scales always work.
break;
case 3:
case 5:
case 9:
// These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
if (AM.HasBaseReg)
return false;
break;
default: // Other stuff never works.
return false;
}
return true;
}
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
unsigned Bits = Ty->getScalarSizeInBits();
// 8-bit shifts are always expensive, but versions with a scalar amount aren't
// particularly cheaper than those without.
if (Bits == 8)
return false;
// On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
// variable shifts just as cheap as scalar ones.
if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
return true;
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
if (!isTypeLegal(EVT::getEVT(Ty1)))
return false;
assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
// Assuming the caller doesn't have a zeroext or signext return parameter,
// truncation all the way down to i1 is valid.
return true;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
// Can also use sub to handle negated immediates.
return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2))
return true;
if (Val.getOpcode() != ISD::LOAD)
return false;
if (!VT1.isSimple() || !VT1.isInteger() ||
!VT2.isSimple() || !VT2.isInteger())
return false;
switch (VT1.getSimpleVT().SimpleTy) {
default: break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
// X86 has 8, 16, and 32-bit zero-extending loads.
return true;
}
return false;
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
// i16 instructions are longer (0x66 prefix) and potentially slower.
return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
EVT VT) const {
if (!VT.isSimple())
return false;
// Not for i1 vectors
if (VT.getSimpleVT().getScalarType() == MVT::i1)
return false;
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSimpleVT().getSizeInBits() == 64)
return false;
// We only care that the types being shuffled are legal. The lowering can
// handle any possible shuffle mask that results.
return isTypeLegal(VT.getSimpleVT());
}
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
EVT VT) const {
// Just delegate to the generic legality, clear masks aren't special.
return isShuffleMaskLegal(Mask, VT);
}
//===----------------------------------------------------------------------===//
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
// thisMBB:
// xbegin fallBB
//
// mainMBB:
// s0 = -1
//
// fallBB:
// eax = # XABORT_DEF
// s1 = eax
//
// sinkMBB:
// v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
unsigned DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
// # fallthrough to mainMBB
// # abort branches to fallMBB
BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(fallMBB);
// mainMBB:
// mainDstReg := -1
BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
// fallMBB:
// ; pseudo instruction to model hardware's definition from XABORT
// EAX := XABORT_DEF
// fallDstReg := EAX
BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
fallMBB->addSuccessor(sinkMBB);
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
}
// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX, all of this code can be replaced with that
// in the .td file.
static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
}
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
unsigned NumArgs = MI.getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(X86::XMM0);
MI.eraseFromParent();
return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII) {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
unsigned NumArgs = MI.getNumOperands(); // operand 0 (the result) is skipped below
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(X86::ECX);
MI.eraseFromParent();
return BB;
}
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
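// WRPKRU takes its input implicitly in EAX and requires ECX and EDX to be
// zero, so set up all three registers before emitting the real instruction.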
// insert input VAL into EAX
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
.addReg(MI.getOperand(0).getReg());
// insert zero to ECX
BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
// insert zero to EDX
BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
// insert WRPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// insert zero to ECX
BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
// insert RDPKRU instruction
BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
.addReg(X86::EAX);
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget,
unsigned Opc) {
DebugLoc dl = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
.addReg(MI.getOperand(ValOps).getReg());
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
.addReg(MI.getOperand(ValOps + 1).getReg());
// The MONITOR/MONITORX instruction itself takes no explicit operands; it reads
// RAX/EAX, ECX and EDX implicitly, all of which were just set up above.
BuildMI(*BB, MI, dl, TII->get(Opc));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
const X86Subtarget &Subtarget) {
DebugLoc dl = MI->getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Address into RAX/EAX
unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI->getOperand(i));
// CLZERO itself takes no explicit operands; the cache-line address is implicit
// in RAX/EAX, which the LEA above has just set up.
BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
MI->eraseFromParent(); // The pseudo is gone now.
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
// 0 ) Output : destination address (reg)
// 1-5) Input : va_list address (addr, i64mem)
// 6 ) ArgSize : Size (in bytes) of vararg type
// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
unsigned DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
MachineOperand &Disp = MI.getOperand(4);
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
unsigned Align = MI.getOperand(8).getImm();
// Memory Reference
assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
DebugLoc DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
// i32 fp_offset
// i64 overflow_area (address)
// i64 reg_save_area (address)
// }
// sizeof(va_list) = 24
// alignment(va_list) = 8
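// Field offsets used below: gp_offset lives at +0, fp_offset at +4,
// overflow_area at +8 and reg_save_area at +16.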
unsigned TotalNumIntRegs = 6;
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
unsigned MaxOffset = TotalNumIntRegs * 8 +
(UseFPOffset ? TotalNumXMMRegs * 16 : 0);
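// For example, with ArgMode == 2 (fp_offset) this gives 6*8 + 8*16 = 176
// bytes; with ArgMode == 1 (gp_offset) only the 48-byte integer area counts.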
// Align ArgSize to a multiple of 8.
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
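// e.g. ArgSize == 12 rounds up to ArgSizeA8 == 16, while ArgSize == 8 stays 8.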
bool NeedsAlign = (Align > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
MachineBasicBlock *offsetMBB;
MachineBasicBlock *endMBB;
unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
unsigned OffsetReg = 0;
if (!UseGPOffset && !UseFPOffset) {
// If we only pull from the overflow region, we don't need to alter control
// flow, so no extra blocks or branches are created.
OffsetDestReg = 0; // unused
OverflowDestReg = DestReg;
offsetMBB = nullptr;
overflowMBB = thisMBB;
endMBB = thisMBB;
} else {
// First emit code to check if gp_offset (or fp_offset) is below the bound.
// If so, pull the argument from reg_save_area. (branch to offsetMBB)
// If not, pull from overflow_area. (branch to overflowMBB)
//
// thisMBB
//   |     .
//   |        .
//   offsetMBB   overflowMBB
//   |        .
//   |     .
//   endMBB
// Registers for the PHI in endMBB
OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *MF = MBB->getParent();
overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
MF->insert(MBBIter, overflowMBB);
MF->insert(MBBIter, endMBB);
// Transfer the remainder of MBB and its successor edges to endMBB.
endMBB->splice(endMBB->begin(), thisMBB,
std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
// Make offsetMBB and overflowMBB successors of thisMBB
thisMBB->addSuccessor(offsetMBB);
thisMBB->addSuccessor(overflowMBB);
// endMBB is a successor of both offsetMBB and overflowMBB
offsetMBB->addSuccessor(endMBB);
overflowMBB->addSuccessor(endMBB);
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.setMemRefs(MMOBegin, MMOEnd);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
.addReg(OffsetReg)
.addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
.addMBB(overflowMBB);
}
// In offsetMBB, emit code to use the reg_save_area.
if (offsetMBB) {
assert(OffsetReg != 0);
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 16)
.add(Segment)
.setMemRefs(MMOBegin, MMOEnd);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
// Add the offset to the reg_save_area to get the final address.
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
.addReg(OffsetReg64)
.addReg(RegSaveReg);
// Compute the offset for the next argument
unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, UseFPOffset ? 4 : 0)
.add(Segment)
.addReg(NextOffsetReg)
.setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
//
// Emit code to use overflow area
//
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.setMemRefs(MMOBegin, MMOEnd);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
.addReg(OverflowAddrReg)
.addImm(Align-1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Align-1));
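// For instance, with Align == 16 this computes (addr + 15) & ~15.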
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
.add(Base)
.add(Scale)
.add(Index)
.addDisp(Disp, 8)
.add(Segment)
.addReg(NextAddrReg)
.setMemRefs(MMOBegin, MMOEnd);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
BuildMI(*endMBB, endMBB->begin(), DL,
TII->get(X86::PHI), DestReg)
.addReg(OffsetDestReg).addMBB(offsetMBB)
.addReg(OverflowDestReg).addMBB(overflowMBB);
}
// Erase the pseudo instruction
MI.eraseFromParent();
return endMBB;
}
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
// Emit code to save XMM registers to the stack. The ABI says that the
// number of registers to save is given in %al, so it's theoretically
// possible to do an indirect jump trick to avoid saving all of them;
// however, this code takes a simpler approach and just executes all
// of the stores if %al is non-zero. It's less code, it's probably
// easier on the hardware branch predictor, and stores aren't all that
// expensive anyway.
// Create the new basic blocks. One block contains all the XMM stores,
// and one block is the final destination regardless of whether any
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
F->insert(MBBIter, EndMBB);
// Transfer the remainder of MBB and its successor edges to EndMBB.
EndMBB->splice(EndMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
// The original block will now fall through to the XMM save block.
MBB->addSuccessor(XMMSaveMBB);
// The XMMSaveMBB will fall through to the end block.
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
unsigned CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
}
// Make sure the last operand is EFLAGS, which gets clobbered by the branch
// that was just emitted, but clearly shouldn't be "saved".
assert((MI.getNumOperands() <= 3 ||
!MI.getOperand(MI.getNumOperands() - 1).isReg() ||
MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
"Expected last argument to be EFLAGS");
unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
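// e.g. the first XMM argument (i == 3) is stored at VarArgsFPOffset within
// the register save area, the second at VarArgsFPOffset + 16, and so on.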
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
.addImm(/*Disp=*/Offset)
.addReg(/*Segment=*/0)
.addReg(MI.getOperand(i).getReg())
.addMemOperand(MMO);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock* BB,
const TargetRegisterInfo* TRI) {
// Scan forward through BB for a use/def of EFLAGS.
MachineBasicBlock::iterator miI(std::next(SelectItr));
for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
const MachineInstr& mi = *miI;
if (mi.readsRegister(X86::EFLAGS))
return false;
if (mi.definesRegister(X86::EFLAGS))
break; // Should have kill-flag - update below.
}
// If we hit the end of the block, check whether EFLAGS is live into a
// successor.
if (miI == BB->end()) {
for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
sEnd = BB->succ_end();
sItr != sEnd; ++sItr) {
MachineBasicBlock* succ = *sItr;
if (succ->isLiveIn(X86::EFLAGS))
return false;
}
}
// We found a def, or hit the end of the basic block and EFLAGS wasn't live
// out. SelectMI should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic block with a
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
case X86::CMOV_V4F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
case X86::CMOV_V8F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
case X86::CMOV_V8I1:
case X86::CMOV_V16I1:
case X86::CMOV_V32I1:
case X86::CMOV_V64I1:
return true;
default:
return false;
}
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
// TrueVal = ...
// cmpTY ccX, r1, r2
// bCC copy1MBB
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
// This code lowers all pseudo-CMOV instructions. Generally it lowers these
// as described above, by inserting a BB, and then making a PHI at the join
// point to select the true and false operands of the CMOV in the PHI.
//
// The code also handles two different cases of multiple CMOV opcodes
// in a row.
//
// Case 1:
// In this case, there are multiple CMOVs in a row, all of which are based on
// the same condition setting (or the exact opposite condition setting).
// In this case we can lower all the CMOVs using a single inserted BB, and
// then make a number of PHIs at the join point to model the CMOVs. The only
// trickiness here is that in a case like:
//
// t2 = CMOV cond1 t1, f1
// t3 = CMOV cond1 t2, f2
//
// when rewriting this into PHIs, we have to perform some renaming on the
// temps since you cannot have a PHI operand refer to a PHI result earlier
// in the same block. The "simple" but wrong lowering would be:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t2(BB1), f2(BB2)
//
// but clearly t2 is not defined in BB1, so that is incorrect. The proper
// renaming is to note that on the path through BB1, t2 is really just a
// copy of t1, and do that renaming, properly generating:
//
// t2 = PHI t1(BB1), f1(BB2)
// t3 = PHI t1(BB1), f2(BB2)
//
// In Case 2, we lower cascaded CMOVs such as
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
// to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
//
// (sitofp (zext (fcmp une)))
//
// we would generate:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// movaps %xmm0, %xmm1
// jne .LBB5_2
// xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
// movaps %xmm1, %xmm0
// .LBB5_4:
// retq
//
// because this custom-inserter would have generated:
//
//   A
//   | \
//   |  B
//   | /
//   C
//   | \
//   |  D
//   | /
//   E
//
// A: X = ...; Y = ...
// B: empty
// C: Z = PHI [X, A], [Y, B]
// D: empty
// E: PHI [X, C], [Z, D]
//
// If we lower both CMOVs in a single step, we can instead generate:
//
//   A
//   | \
//   |  C
//   | /|
//   |/ |
//   |  |
//   |  D
//   | /
//   E
//
// A: X = ...; Y = ...
// D: empty
// E: PHI [X, A], [X, C], [Y, D]
//
// Which, in our sitofp/fcmp example, gives us something like:
//
// ucomiss %xmm1, %xmm0
// movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
// xorps %xmm0, %xmm0
// .LBB5_4:
// retq
//
MachineInstr *CascadedCMOV = nullptr;
MachineInstr *LastCMOV = &MI;
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
// Check for case 1, where there are multiple CMOVs with the same condition
// first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
// number of jumps the most.
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition.
while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
++NextMIIt;
}
}
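// At this point [MI, LastCMOV] is the maximal run of CMOV pseudos that share
// CC or OppCC; all of them will be rewritten against the single branch
// emitted below.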
// This checks for case 2, but only if we didn't already find case 1, as
// indicated by LastCMOV still pointing at MI.
if (LastCMOV == &MI && NextMIIt != BB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
CascadedCMOV = &*NextMIIt;
}
MachineBasicBlock *jcc1MBB = nullptr;
// If we have a cascaded CMOV, we lower it to two successive branches to
// the same block. EFLAGS is used by both, so mark it as live in the second.
if (CascadedCMOV) {
jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, jcc1MBB);
jcc1MBB->addLiveIn(X86::EFLAGS);
}
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, copy0MBB);
F->insert(It, sinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
sinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
if (CascadedCMOV) {
// The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
BB->addSuccessor(jcc1MBB);
// In that case, jcc1MBB will itself either fall through to copy0MBB or
// jump to sinkMBB.
jcc1MBB->addSuccessor(copy0MBB);
jcc1MBB->addSuccessor(sinkMBB);
} else {
BB->addSuccessor(copy0MBB);
}
// The true block target of the first (or only) branch is always sinkMBB.
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
unsigned Opc = X86::GetCondBranchFromCond(CC);
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
if (CascadedCMOV) {
unsigned Opc2 = X86::GetCondBranchFromCond(
(X86::CondCode)CascadedCMOV->getOperand(3).getImm());
BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
}
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
copy0MBB->addSuccessor(sinkMBB);
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
MachineInstrBuilder MIB;
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that fed into that PHI.
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
unsigned DestReg = MIIt->getOperand(0).getReg();
unsigned Op1Reg = MIIt->getOperand(1).getReg();
unsigned Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
// PHI that is going to be generated.
if (MIIt->getOperand(3).getImm() == OppCC)
std::swap(Op1Reg, Op2Reg);
if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
Op1Reg = RegRewriteTable[Op1Reg].first;
if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
Op2Reg = RegRewriteTable[Op2Reg].second;
MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
TII->get(X86::PHI), DestReg)
.addReg(Op1Reg).addMBB(copy0MBB)
.addReg(Op2Reg).addMBB(thisMBB);
// Add this PHI to the rewrite table.
RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
}
// If we have a cascaded CMOV, the second Jcc provides the same incoming
// value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
if (CascadedCMOV) {
MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
// Copy the PHI result to the register defined by the second CMOV.
BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
DL, TII->get(TargetOpcode::COPY),
CascadedCMOV->getOperand(0).getReg())
.addReg(MI.getOperand(0).getReg());
CascadedCMOV->eraseFromParent();
}
// Now remove the CMOV(s).
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
(MIIt++)->eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Combine the following atomic floating-point modification pattern:
// a.store(reg OP a.load(acquire), release)
// Transform it into:
// OPss (%gpr), %xmm
// movss %xmm, (%gpr)
// Or sd equivalent for 64-bit operations.
unsigned MOp, FOp;
switch (MI.getOpcode()) {
default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
case X86::RELEASE_FADD32mr:
FOp = X86::ADDSSrm;
MOp = X86::MOVSSmr;
break;
case X86::RELEASE_FADD64mr:
FOp = X86::ADDSDrm;
MOp = X86::MOVSDmr;
break;
}
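// FOp folds the load (dst_xmm = src_xmm OP [mem]); MOp then stores dst_xmm
// back to the same address, producing the two-instruction sequence above.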
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
unsigned ValOpIdx = X86::AddrNumOperands;
unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(FOp),
MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
.addReg(VSrc);
for (int i = 0; i < X86::AddrNumOperands; ++i) {
MachineOperand &Operand = MI.getOperand(i);
// Clear any kill flags on register operands as we'll create a second
// instruction using the same address operands.
if (Operand.isReg())
Operand.setIsKill(false);
MIB.add(Operand);
}
MachineInstr *FOpMI = MIB;
MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
for (int i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI.getOperand(i));
MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
const bool Is64Bit = Subtarget.is64Bit();
const bool IsLP64 = Subtarget.isTarget64BitLP64();
const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
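// The stack limit of the current stacklet lives in thread-local storage:
// %fs:0x70 on LP64, %fs:0x40 on 64-bit ILP32 targets, and %gs:0x30 on 32-bit.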
// BB:
// ... [Till the alloca]
// If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
// Allocate by subtracting from RSP
// Jump to continueMBB
//
// mallocMBB:
// Allocate by call to runtime
//
// continueMBB:
// ...
// [rest of original BB]
//
MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI.getOperand(1).getReg(),
physSPReg =
IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MF->insert(MBBIter, continueMBB);
continueMBB->splice(continueMBB->begin(), BB,
std::next(MachineBasicBlock::iterator(MI)), BB->end());
continueMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
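// The CMP above compares the TLS stack limit against SPLimitVReg (= SP - size);
// if the limit is greater, the allocation would run past the stacklet, so we
// branch to mallocMBB and let the runtime allocate a new one.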
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// mallocMBB calls a libgcc routine to allocate more stack space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EDI, RegState::Implicit)
.addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
.addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
BB->addSuccessor(mallocMBB);
mallocMBB->addSuccessor(continueMBB);
bumpMBB->addSuccessor(continueMBB);
// Take care of the PHI nodes.
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
MI.getOperand(0).getReg())
.addReg(mallocPtrVReg)
.addMBB(mallocMBB)
.addReg(bumpSPPtrVReg)
.addMBB(bumpMBB);
// Delete the original pseudo instruction.
MI.eraseFromParent();
// And we're done.
return continueMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
if (!Subtarget.is32Bit())
return BB;
// C++ EH creates a new target block to hold the restore code, and wires up
// the new block to the return destination with a normal JMP_4.
MachineBasicBlock *RestoreMBB =
MF->CreateMachineBasicBlock(BB->getBasicBlock());
assert(BB->succ_size() == 1);
MF->insert(std::next(BB->getIterator()), RestoreMBB);
RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
auto RestoreMBBI = RestoreMBB->begin();
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const Constant *PerFn = MF->getFunction()->getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
if (IsSEH && Subtarget.is32Bit()) {
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into a call inside MC, so without
// the two markers shrink-wrapping may push the prologue/epilogue past them.
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
// We don't call erase from parent because we want to keep the
// original instruction around.
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
MachineInstrBuilder CallseqEnd =
BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const {
// This is pretty easy. We're taking the value that we received from
// our load from the relocation, sticking it in either RDI (x86-64)
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
Subtarget.is64Bit() ?
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
addDirectMem(MIB, X86::RDI);
MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else if (!isPositionIndependent()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(0)
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
.addReg(TII->getGlobalBaseReg(F))
.addImm(0)
.addReg(0)
.addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
MI.getOperand(3).getTargetFlags())
.addReg(0);
MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
addDirectMem(MIB, X86::EAX);
MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
unsigned DstReg;
unsigned MemOpndSlot = 0;
unsigned CurOp = 0;
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
// buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
// v_main = 0
//
// sinkMBB:
// v = phi(main, restore)
//
// restoreMBB:
// if the base pointer is being used, load it from the frame
// v_restore = 1
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
// Transfer the remainder of BB and its successor edges to sinkMBB.
sinkMBB->splice(sinkMBB->begin(), MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
unsigned PtrStoreOpc = 0;
unsigned LabelReg = 0;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
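// Slot layout of the setjmp buffer, shared with emitEHSjLjLongJmp below:
// frame pointer at offset 0, resume IP at LabelOffset, stack pointer at
// 2 * PVT.getStoreSize(). Only the IP is stored by this function.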
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
// Prepare IP either in reg or imm.
if (!UseImmLabel) {
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
.addReg(X86::RIP)
.addImm(0)
.addReg(0)
.addMBB(restoreMBB)
.addReg(0);
} else {
const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
.addReg(XII->getGlobalBaseReg(MF))
.addImm(0)
.addReg(0)
.addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store IP
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
else
MIB.addMBB(restoreMBB);
MIB.setMemRefs(MMOBegin, MMOEnd);
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// mainMBB:
// EAX = 0
BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(X86::PHI), DstReg)
.addReg(mainDstReg).addMBB(mainMBB)
.addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
const bool Uses64BitFramePtr =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
unsigned BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) &&
"Invalid Pointer Size!");
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
const int64_t LabelOffset = 1 * PVT.getStoreSize();
const int64_t SPOffset = 2 * PVT.getStoreSize();
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
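// Reload, in order, the frame pointer (offset 0), the resume IP (LabelOffset)
// and the stack pointer (SPOffset) from the buffer written by setjmp, then
// jump through the reloaded IP.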
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
else
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
return MBB;
}
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
unsigned Op = 0;
unsigned VR = 0;
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
!isPositionIndependent();
if (UseImmLabel) {
Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
} else {
const TargetRegisterClass *TRC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
.addImm(1)
.addReg(0)
.addMBB(DispatchBB)
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
.addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
.addReg(0);
}
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
addFrameReference(MIB, FI, 36);
if (UseImmLabel)
MIB.addMBB(DispatchBB);
else
MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (auto &MBB : *MF) {
if (!MBB.isEHPad())
continue;
MCSymbol *Sym = nullptr;
for (const auto &MI : MBB) {
if (MI.isDebugValue())
continue;
assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
continue;
for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
// Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList;
SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
LPadList.reserve(CallSiteNumToLPad.size());
for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
for (auto &LP : CallSiteNumToLPad[CSI]) {
LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
assert(!LPadList.empty() &&
"No landing pad destinations for the dispatch jump table!");
// Create the MBBs for the dispatch code.
// Shove the dispatch's address into the return slot in the function context.
MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
DispatchBB->setIsEHPad(true);
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
BuildMI(TrapBB, DL, TII->get(X86::TRAP));
DispatchBB->addSuccessor(TrapBB);
MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
DispatchBB->addSuccessor(DispContBB);
// Insert MBBs.
MF->push_back(DispatchBB);
MF->push_back(DispContBB);
MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
MachineJumpTableInfo *JTI =
MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
const bool FPIs64Bit =
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
unsigned FP = RI.getFrameRegister(*MF);
unsigned BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
.addRegMask(RI.getNoPreservedMask());
} else {
BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
.addRegMask(RI.getNoPreservedMask());
}
unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
.addReg(IReg)
.addImm(1);
BuildMI(DispContBB, DL,
TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
.addReg(0)
.addImm(Subtarget.is64Bit() ? 8 : 4)
.addReg(JReg)
.addJumpTableIndex(MJTI)
.addReg(0);
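// In effect: jmp *JumpTable(, %JReg, 8) on 64-bit targets (scale 4 on 32-bit),
// where JReg is the call-site index from the function context minus one.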
// Add the jump table entries as successors to the MBB.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
for (auto &LP : LPadList)
if (SeenMBBs.insert(LP).second)
DispContBB->addSuccessor(LP);
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
// Keep a copy of Successors since it's modified inside the loop.
SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
MBB->succ_rend());
// FIXME: Avoid quadratic complexity.
for (auto MBBS : Successors) {
if (MBBS->isEHPad()) {
MBB->removeSuccessor(MBBS);
MBBLPads.push_back(MBBS);
}
}
MBB->addSuccessor(DispatchBB);
// Find the invoke call and mark all of the callee-saved registers as
// 'implicit defined' so that they're spilled. This prevents later passes from
// moving instructions to before the EH block, where they would never be
// executed.
for (auto &II : reverse(*MBB)) {
if (!II.isCall())
continue;
DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
if (MOp.isReg())
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
unsigned Reg = SavedRegs[RI];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
break;
}
}
// Mark all former landing pads as non-landing pads. The dispatch is the only
// landing pad now.
for (auto &LP : MBBLPads)
LP->setIsEHPad(false);
// The instruction is gone now.
MI.eraseFromParent();
return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
return BB;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR64:
case X86::CMOV_FR128:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
case X86::CMOV_V4F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
case X86::CMOV_V16F32:
case X86::CMOV_V8F32:
case X86::CMOV_V8F64:
case X86::CMOV_V8I64:
case X86::CMOV_V8I1:
case X86::CMOV_V16I1:
case X86::CMOV_V32I1:
case X86::CMOV_V64I1:
return EmitLoweredSelect(MI, BB);
case X86::RDFLAGS32:
case X86::RDFLAGS64: {
unsigned PushF =
MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
// Permit reads of the FLAGS register without it being defined.
// This intrinsic exists to read external processor state in flags, such as
// the trap flag, interrupt flag, and direction flag, none of which are
// modeled by the backend.
Push->getOperand(2).setIsUndef();
BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::WRFLAGS32:
case X86::WRFLAGS64: {
unsigned Push =
MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
case X86::RELEASE_FADD32mr:
case X86::RELEASE_FADD64mr:
return EmitLoweredAtomicFP(MI, BB);
case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
case X86::FP64_TO_INT16_IN_MEM:
case X86::FP64_TO_INT32_IN_MEM:
case X86::FP64_TO_INT64_IN_MEM:
case X86::FP80_TO_INT16_IN_MEM:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), CWFrameIdx);
// Load the old value of the high byte of the control word...
unsigned OldCW =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
// Set the high part to be round to zero...
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
.addImm(0xC7F);
// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), CWFrameIdx);
// Restore the memory image of control word to original value
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
.addReg(OldCW);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}
X86AddressMode AM = getAddressFromInstr(&MI, 0);
addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
.addReg(MI.getOperand(X86::AddrNumOperands).getReg());
// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FLDCW16m)), CWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
// String/text processing lowering.
case X86::PCMPISTRM128REG:
case X86::VPCMPISTRM128REG:
case X86::PCMPISTRM128MEM:
case X86::VPCMPISTRM128MEM:
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
case X86::VPCMPESTRM128MEM:
assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
case X86::VPCMPISTRIREG:
case X86::PCMPISTRIMEM:
case X86::VPCMPISTRIMEM:
case X86::PCMPESTRIREG:
case X86::VPCMPESTRIREG:
case X86::PCMPESTRIMEM:
case X86::VPCMPESTRIMEM:
assert(Subtarget.hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
case X86::MONITORX:
return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
// Cache line zero
case X86::CLZERO:
return emitClzero(&MI, BB, Subtarget);
// PKU feature
case X86::WRPKRU:
return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
return emitRDPKRU(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
return emitXBegin(MI, BB, Subtarget.getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
return emitEHSjLjSetJmp(MI, BB);
case X86::EH_SjLj_LongJmp32:
case X86::EH_SjLj_LongJmp64:
return emitEHSjLjLongJmp(MI, BB);
case X86::Int_eh_sjlj_setup_dispatch:
return EmitSjLjDispatchBlock(MI, BB);
case TargetOpcode::STATEPOINT:
// As an implementation detail, STATEPOINT shares the STACKMAP format at
// this point in the process. We diverge later.
return emitPatchPoint(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
case TargetOpcode::PATCHABLE_EVENT_CALL:
// Do nothing here, handle in xray instrumentation pass.
return BB;
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
// requires a memory operand. If it happens that the current architecture is
// i686 and the current function needs a base pointer - which is ESI on i686 -
// the register allocator would not be able to allocate registers for an
// address of the form X(%reg, %reg, Y): there would never be enough unreserved
// registers during regalloc (without the need for a base pointer the only
// option would be X(%edi, %esi, Y)). We give the register allocator a hand by
// precomputing the address in a new vreg using LEA.
// If this is not i686 or there is no base pointer, there is nothing to do here.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
return BB;
// Even though this code does not necessarily need the base pointer to be
// ESI, we check for that. The reason: if this assert fails, something has
// changed in the compiler's base pointer handling, and that change most
// probably has to be addressed here as well.
assert(TRI->getBaseRegister() == X86::ESI &&
"LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
"base pointer in mind");
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use index register.
if (AM.IndexReg == X86::NoRegister)
return BB;
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
MachineBasicBlock::iterator MBBI(MI);
while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
--MBBI;
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
setDirectAddressInInstr(&MI, 0, computedAddrVReg);
return BB;
}
case X86::LCMPXCHG16B:
return BB;
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
}
}
//===----------------------------------------------------------------------===//
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::ADD:
case X86ISD::SUB:
case X86ISD::ADC:
case X86ISD::SBB:
case X86ISD::SMUL:
case X86ISD::UMUL:
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::OR:
case X86ISD::XOR:
case X86ISD::AND:
// These nodes' second result is a boolean.
if (Op.getResNo() == 0)
break;
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
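// SETCC (and the boolean second results that fall through from above)
// produce either 0 or 1, so every bit above bit 0 is known to be zero.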
Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
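// MOVMSK packs one sign bit per input element into the low bits of the
// result; all higher result bits are zero.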
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
Known.Zero.setBitsFrom(NumLoBits);
break;
}
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
Known.setAllZero();
break;
}
DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
unsigned ShAmt = ShiftImm->getZExtValue();
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else {
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
}
}
break;
}
case X86ISD::VZEXT: {
SDValue N0 = Op.getOperand(0);
unsigned NumElts = VT.getVectorNumElements();
EVT SrcVT = N0.getValueType();
unsigned InNumElts = SrcVT.getVectorNumElements();
unsigned InBitWidth = SrcVT.getScalarSizeInBits();
assert(InNumElts >= NumElts && "Illegal VZEXT input");
Known = KnownBits(InBitWidth);
APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
Known = Known.zext(BitWidth);
Known.Zero.setBitsFrom(InBitWidth);
break;
}
}
}
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
unsigned Depth) const {
unsigned VTBits = Op.getScalarValueSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
case X86ISD::VSEXT: {
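// Sign-extension preserves the source's sign bits and adds one known sign
// bit for every newly introduced high bit.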
SDValue Src = Op.getOperand(0);
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
case X86ISD::VSHLI: {
SDValue Src = Op.getOperand(0);
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
}
case X86ISD::VSRAI: {
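// An arithmetic shift right by N replicates the sign bit into N more
// positions, so the sign-bit count grows by N (capped at the bit width).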
SDValue Src = Op.getOperand(0);
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
case X86ISD::PCMPGT:
case X86ISD::PCMPEQ:
case X86ISD::CMPP:
case X86ISD::VPCOM:
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
}
// Fallback case.
return 1;
}
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
const GlobalValue* &GA,
int64_t &Offset) const {
if (N->getOpcode() == X86ISD::Wrapper) {
if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
return true;
}
}
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
(MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
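// e.g. with Scale == 2, a mask such as {0, Z, 1, Z, 2, Z, 3, Z} (Z being
// zero/undef) zero-extends each source element to twice its width.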
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
unsigned NumDstElts = NumMaskElts / Scale;
for (unsigned i = 0; i != NumDstElts && Match; ++i) {
Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
if (SrcVT != MaskVT)
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
: unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
// Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit (MOVSS).
if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
isUndefOrEqual(Mask[0], 0) &&
isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Check if we have SSE3, which will let us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD, but have the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
}
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
}
// Attempt to match against broadcast-from-vector.
if (Subtarget.hasAVX2()) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
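// An all-zeros mask splats element 0 into every lane, which is exactly
// what VBROADCAST does.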
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
SrcVT = DstVT = MaskVT;
Shuffle = X86ISD::VBROADCAST;
return true;
}
}
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain,
bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
bool ContainsZeros =
llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
// Check for lane crossing permutes.
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
// PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
SmallVector<int, 4> RepeatedMask;
if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
Shuffle = X86ISD::VPERMI;
ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
} else if (AllowFloatDomain && Subtarget.hasAVX()) {
// VPERMILPD can permute with a non-repeating shuffle.
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
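// VPERMILPD takes one immediate bit per element, selecting the even or
// odd f64 within that element's 128-bit lane.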
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
}
// Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before that we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
!ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
scaleShuffleMask(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
ArrayRef<int> HiMask(Mask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
Shuffle = X86ISD::PSHUFLW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
// PSHUFHW: permute upper 4 elements only.
if (isUndefOrInRange(HiMask, 4, 8) &&
isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
// Offset the HiMask so that we can create the shuffle immediate.
int OffsetHiMask[4];
for (int i = 0; i != 4; ++i)
OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
Shuffle = X86ISD::PSHUFHW;
ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
}
}
// Attempt to match against byte/bit shifts.
// FIXME: Add 512-bit support.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool AllowFloatDomain, bool AllowIntDomain,
SDValue &V1, SDValue &V2, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
ShuffleVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
ShuffleVT = MaskVT;
return true;
}
}
// Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
(MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
DAG, Subtarget)) {
ShuffleVT = MaskVT;
if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}
return false;
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
const APInt &Zeroable,
bool AllowFloatDomain,
bool AllowIntDomain,
SDValue &V1, SDValue &V2, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
PermuteImm = ByteRotation;
return true;
}
}
// Attempt to combine to X86ISD::BLENDI.
if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
(Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
RepeatedMask)) {
assert(RepeatedMask.size() == 8 &&
"Repeated mask size doesn't match!");
PermuteImm = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
PermuteImm |= 1 << i;
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::BLENDI;
ShuffleVT = MaskVT;
return true;
}
} else {
// Determine a type compatible with X86ISD::BLENDI.
ShuffleVT = MaskVT;
if (Subtarget.hasAVX2()) {
if (ShuffleVT == MVT::v4i64)
ShuffleVT = MVT::v8i32;
else if (ShuffleVT == MVT::v2i64)
ShuffleVT = MVT::v4i32;
} else {
if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
ShuffleVT = MVT::v8i16;
else if (ShuffleVT == MVT::v4i64)
ShuffleVT = MVT::v4f64;
else if (ShuffleVT == MVT::v8i32)
ShuffleVT = MVT::v8f32;
}
if (!ShuffleVT.isFloatingPoint()) {
int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
BlendMask =
scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
}
V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
return true;
}
}
}
// Attempt to combine to INSERTPS.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
return true;
}
}
// Attempt to combine to SHUFPD.
if (AllowFloatDomain && EltSizeInBits == 64 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
if (AllowFloatDomain && EltSizeInBits == 32 &&
((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask, to determine if it's just
// referencing one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
if (isUndefInRange(RepeatedMask, Offset, 2)) {
return DAG.getUNDEF(MaskVT);
} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
S0 = (SM_SentinelUndef == M0 ? -1 : 0);
S1 = (SM_SentinelUndef == M1 ? -1 : 1);
return getZeroVector(MaskVT, Subtarget, DAG, DL);
} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V1;
} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
return V2;
}
return SDValue();
};
int ShufMask[4] = {-1, -1, -1, -1};
SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
if (Lo && Hi) {
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
}
}
return false;
}
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> BaseMask, int Depth,
bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
// Find the inputs that enter the chain. Note that multiple uses are OK
// here; we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
: peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
MVT RootVT = Root.getSimpleValueType();
assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
VT2.getSizeInBits() == RootVT.getSizeInBits() &&
"Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
/*AddTo*/ true);
return true;
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
// TODO - this currently prevents all lane shuffles from occurring.
// TODO - check for writemasks usage instead of always preventing combining.
// TODO - attempt to narrow Mask back to writemask size.
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
return false;
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
// TODO - this should support binary shuffles.
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
return false; // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
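// VPERM2X128 immediate: the low nibble controls the low 128-bit lane and
// the high nibble the high lane; 0x8 zeroes that lane, otherwise the low
// bit of the nibble selects which half of the input goes there.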
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
Res = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// For masks that have been widened to 128-bit elements or more,
// narrow back down to 64-bit elements.
SmallVector<int, 64> Mask;
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
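// e.g. a 2-element mask {1, 0} with 128-bit elements is rescaled to the
// 64-bit-element mask {2, 3, 0, 1}.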
scaleShuffleMask(MaskScale, BaseMask, Mask);
} else {
Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
// Determine the effective mask value type.
FloatDomain &= (32 <= MaskEltSizeInBits);
MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
: MVT::getIntegerVT(MaskEltSizeInBits);
MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
return false;
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
APInt Zeroable(NumMaskElts, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
if (isUndefOrZero(Mask[i]))
Zeroable.setBit(i);
if (UnaryShuffle) {
// If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
/*AddTo*/ true);
return true;
}
}
if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
return false; // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleSrcVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle,
ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
return false; // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
}
if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
return false; // AVX512 Writemask clash.
V1 = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ShuffleVT, V2);
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, V1, V2, DL, DAG,
Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
return false; // AVX512 Writemask clash.
V1 = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ShuffleVT, V2);
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// Typically from here on, we need an integer version of MaskVT.
MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
// Annoyingly, SSE4A instructions don't map into the above match helpers.
if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
uint64_t BitLen, BitIdx;
if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
return false; // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
DCI.AddToWorklist(V1.getNode());
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
return false; // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(IntMaskVT, V2);
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
return false;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
DCI.AddToWorklist(Zero.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
(Subtarget.hasVLX() &&
(MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
return false;
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
continue;
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
DCI.AddToWorklist(BitMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
unsigned AndOpcode =
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use the variable-mask form of VPERMILPS.
// TODO: Combine other mask types at higher depths.
if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
for (int M : Mask) {
SDValue Idx =
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
// Bits[3] - Match Bit.
// Bits[2:1] - (Per Lane) PD Shuffle Mask.
// Bits[2:0] - (Per Lane) PS Shuffle Mask.
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
VPerm2Idx.push_back(-1);
continue;
}
if (M == SM_SentinelZero) {
M2ZImm = 2;
VPerm2Idx.push_back(8);
continue;
}
int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// If we have 3 or more shuffle instructions or a chain involving a variable
// mask, we can replace them with a single PSHUFB instruction profitably.
// Intel's manuals suggest only using PSHUFB if doing so replaces 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<SDValue, 16> PSHUFBMask;
int NumBytes = RootVT.getSizeInBits() / 8;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
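// A PSHUFB index byte with its top bit set zeroes the destination byte,
// so zeroable lanes get 0xFF (255).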
if (M == SM_SentinelZero) {
PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
assert ((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
Res = DAG.getBitcast(ByteVT, V1);
DCI.AddToWorklist(Res.getNode());
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
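// A mask byte of 128 (0x80) therefore encodes operation 4 (ZERO), which is
// what the zeroable lanes below are given.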
SmallVector<SDValue, 16> VPPERMMask;
int NumBytes = 16;
int Ratio = NumBytes / NumMaskElts;
for (int i = 0; i < NumBytes; ++i) {
int M = Mask[i / Ratio];
if (M == SM_SentinelUndef) {
VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
}
if (M == SM_SentinelZero) {
VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
V1 = DAG.getBitcast(ByteVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ByteVT, V2);
DCI.AddToWorklist(V2.getNode());
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
DCI.AddToWorklist(VPPERMMaskOp.getNode());
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
// Failed to find any combines.
return false;
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
ArrayRef<int> Mask, SDValue Root,
bool HasVariableMask, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
unsigned NumMaskElts = Mask.size();
unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
unsigned NumOps = Ops.size();
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
SmallVector<APInt, 16> UndefEltsOps(NumOps);
SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
return false;
}
// Only fold if at least one of the constants is only used once or
// the combined shuffle has included a variable mask shuffle; this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
return false;
// Shuffle the constant bits according to the mask.
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
ZeroElts.setBit(i);
continue;
}
ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
else
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
DCI.AddToWorklist(CstOp.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
return true;
}
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
/// equivalent. In most cases, this is just an encoding size win, but
/// sometimes we will collapse multiple generic shuffles into a single
/// special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
/// instructions, and replace them with the slightly more expensive SSSE3
/// PSHUFB instruction if available. We do this as the last combining step
/// to ensure we avoid using PSHUFB if we can implement the shuffle with
/// a suitable short sequence of other instructions. The PSHUFB will either
/// use a register or have to read from memory and so is slightly (but only
/// slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
ArrayRef<const SDNode*> SrcNodes,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
if (Depth > 8)
return false;
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return false;
assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
SDValue BC = peekThroughBitcasts(Ops[i]);
if (Input0 && BC == peekThroughBitcasts(Input0))
InputIdx0 = i;
if (Input1 && BC == peekThroughBitcasts(Input1))
InputIdx1 = i;
}
if (Input0 && InputIdx0 < 0) {
InputIdx0 = SrcOpIndex;
Ops[SrcOpIndex] = Input0;
}
if (Input1 && InputIdx1 < 0) {
InputIdx1 = Ops.size();
Ops.push_back(Input1);
}
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
// This function can be performance-critical, so we rely on the power-of-2
// knowledge that we have about the mask sizes to replace div/rem ops with
// bit-masks and shifts.
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
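// e.g. a 4-element root mask feeding a 16-element op mask gives
// RootRatio == 4 and OpRatio == 1: each root mask element expands to four
// entries of the combined 16-element mask.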
assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
for (unsigned i = 0; i < MaskWidth; ++i) {
unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
Mask[i] = RootMask[RootIdx];
continue;
}
unsigned RootMaskedIdx =
RootRatio == 1
? RootMask[RootIdx]
: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
Mask[i] = RootMaskedIdx;
continue;
}
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
unsigned OpMaskedIdx =
OpRatio == 1
? OpMask[OpIdx]
: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
if (OpMask[OpIdx] < (int)OpMask.size()) {
assert(0 <= InputIdx0 && "Unknown target shuffle input");
OpMaskedIdx += InputIdx0 * MaskWidth;
} else {
assert(0 <= InputIdx1 && "Unknown target shuffle input");
OpMaskedIdx += InputIdx1 * MaskWidth;
}
Mask[i] = OpMaskedIdx;
}
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
return true;
}
if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
// TODO - should we handle the mixed zero/undef case as well? Just returning
// a zero mask will lose information on undef elements, possibly reducing
// future combine possibilities.
DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
Subtarget, DAG, SDLoc(Root)));
return true;
}
// Remove unused shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
SrcNodes.end());
CombinedNodes.push_back(Op.getNode());
// See if we can recurse into each shuffle source op (if it's a target
// shuffle). The source op should only be combined if it either has a
// single use (i.e. current Op) or all its users have already been combined.
for (int i = 0, e = Ops.size(); i < e; ++i)
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
Depth + 1, HasVariableMask, DAG, DCI,
Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
Subtarget))
return true;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() > 2)
return false;
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
// Canonicalization of binary shuffle masks to improve pattern matching by
// commuting the inputs.
if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(Ops[0], Ops[1]);
}
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
SmallVector<SDValue, 2> Ops;
bool IsUnary;
bool HaveMask =
getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
// If we have more than 128 bits, only the low 128 bits of the shuffle mask
// matter. Check that the upper masks are repeats and remove them.
if (VT.getSizeInBits() > 128) {
int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
for (int j = 0; j < LaneElts; ++j)
assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
"Mask doesn't repeat in high 128-bit lanes!");
#endif
Mask.resize(LaneElts);
}
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::PSHUFLW:
Mask.resize(4);
return Mask;
case X86ISD::PSHUFHW:
Mask.erase(Mask.begin(), Mask.begin() + 4);
for (int &M : Mask)
M -= 4;
return Mask;
default:
llvm_unreachable("No valid shuffle instruction found!");
}
}
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
// of the shuffles in the chain so that we can form a fresh chain to replace
// this one.
SmallVector<SDValue, 8> Chain;
SDValue V = N.getOperand(0);
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFD:
// Found another dword shuffle.
break;
case X86ISD::PSHUFLW:
// Check that the low words (being shuffled) are the identity in the
// dword shuffle, and the high words are self-contained.
if (Mask[0] != 0 || Mask[1] != 1 ||
!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::PSHUFHW:
// Check that the high words (being shuffled) are the identity in the
// dword shuffle, and the low words are self-contained.
if (Mask[2] != 2 || Mask[3] != 3 ||
!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
return SDValue();
Chain.push_back(V);
continue;
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
V.getSimpleValueType().getVectorElementType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
unsigned CombineOp =
V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
if (V.getOperand(0) != V.getOperand(1) ||
!V->isOnlyUserOf(V.getOperand(0).getNode()))
return SDValue();
Chain.push_back(V);
V = V.getOperand(0);
do {
switch (V.getOpcode()) {
default:
return SDValue(); // Nothing to combine.
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOp)
break;
Chain.push_back(V);
LLVM_FALLTHROUGH;
case ISD::BITCAST:
V = V.getOperand(0);
continue;
}
break;
} while (V.hasOneUse());
break;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return SDValue();
// Merge this node's mask and our incoming mask.
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
break;
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
break;
}
}
if (V.getValueType() != N.getValueType())
V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
}
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
assert(
(N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
unsigned CombineOpcode = N.getOpcode();
// Walk up a single-use chain looking for a combinable shuffle.
SDValue V = N.getOperand(0);
for (; V.hasOneUse(); V = V.getOperand(0)) {
switch (V.getOpcode()) {
default:
return false; // Nothing combined!
case ISD::BITCAST:
// Skip bitcasts as we always know the type for the target specific
// instructions.
continue;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
if (V.getOpcode() == CombineOpcode)
break;
// Other-half shuffles are no-ops.
continue;
}
// Break out of the loop if we break out of the switch.
break;
}
if (!V.hasOneUse())
// We fell out of the loop without finding a viable combining instruction.
return false;
// Combine away the bottom node as its shuffle will be accumulated into
// a preceding shuffle.
DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
// Record the old value.
SDValue Old = V;
// Merge this node's mask and our incoming mask (adjusted to account for all
// the pshufd instructions encountered).
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Check that the shuffles didn't cancel each other out. If not, we need to
// combine to the new one.
if (Old != V)
// Replace the combinable shuffle with the combined one, updating all users
// so that we re-evaluate the chain here.
DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
return true;
}
/// \brief Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
Mask = getPSHUFShuffleMask(N);
assert(Mask.size() == 4);
break;
case X86ISD::UNPCKL: {
auto Op0 = N.getOperand(0);
auto Op1 = N.getOperand(1);
unsigned Opcode0 = Op0.getOpcode();
unsigned Opcode1 = Op1.getOpcode();
// Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
// X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
// TODO: Add other horizontal operations as required.
if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
// which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
// moves the upper-half elements into the lower half. For example:
//
// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
// undef:v16i8
// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
//
// will be combined to:
//
// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
// This is only for 128-bit vectors. From SSE4.1 onward this combine may not
// happen due to advanced instructions.
if (!VT.is128BitVector())
return SDValue();
if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
unsigned NumElts = VT.getVectorNumElements();
SmallVector<int, 8> ExpectedMask(NumElts, -1);
std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
NumElts / 2);
auto ShufOp = Op1.getOperand(0);
if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
}
return SDValue();
}
case X86ISD::BLENDI: {
SDValue V0 = N->getOperand(0);
SDValue V1 = N->getOperand(1);
assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
"Unexpected input vector types");
// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
// operands and changing the mask to 1. This saves us a bunch of
// pattern-matching possibilities related to scalar math ops in SSE/AVX.
// x86InstrInfo knows how to commute this back after instruction selection
// if it would help register allocation.
// TODO: If optimizing for size or a processor that doesn't suffer from
// partial register update stalls, this should be transformed into a MOVSD
// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
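// For example, blendi V0, V1, 0b10 produces <V0[0], V1[1]>; after swapping
// the operands, blendi V1, V0, 0b01 selects exactly the same elements.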
if (VT == MVT::v2f64)
if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}
return SDValue();
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
if (isZero0 && isZero1)
return SDValue();
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
bool isFloat = VT.isFloatingPoint();
bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
V0 = DAG.getBitcast(NewVT, V0);
V1 = DAG.getBitcast(NewVT, V1);
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
}
return SDValue();
}
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
SDValue Op2 = N.getOperand(2);
unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
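// For example, an immediate of 0x4C (binary 01 00 1100) copies element 1 of
// Op1 into element 0 of Op0 and zeroes elements 2 and 3 of the result.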
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
int M = TargetMask1[SrcIdx];
if (isUndefOrZero(M)) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
return SDValue();
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
int M = TargetMask0[i];
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (isUndefOrZero(M)) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
}
// The input vector element must be in place (index i of either shuffle input).
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
}
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
return SDValue();
}
default:
return SDValue();
}
// Nuke no-op shuffles that show up after combining.
if (isNoopShuffleMask(Mask))
return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
// Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
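// For example, PSHUFLW <2,3,0,1> swaps words {0,1} with words {2,3}, which
// is the same as swapping dwords 0 and 1, i.e. a PSHUFD <1,0,2,3> on the
// vector bitcast to i32 elements.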
if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
V = DAG.getBitcast(DVT, V);
DCI.AddToWorklist(V.getNode());
V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
DCI.AddToWorklist(V.getNode());
return DAG.getBitcast(VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
// FIXME: This doesn't handle the location of the PSHUFD generically, and
// only works when we have a PSHUFD followed by two half-shuffles.
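// For example:
//   PSHUFHW <2,2,3,3> (PSHUFLW <0,0,1,1> (PSHUFD <0,3,2,1> x))
// yields <x0,x0,x1,x1,x2,x2,x3,x3>, which is exactly UNPCKL(x, x).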
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
V.hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
int WordMask[8];
for (int i = 0; i < 4; ++i) {
WordMask[i + NOffset] = Mask[i] + NOffset;
WordMask[i + VOffset] = VMask[i] + VOffset;
}
// Map the word mask through the DWord mask.
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
V = DAG.getBitcast(VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
DL, VT, V, V);
}
}
}
break;
case X86ISD::PSHUFD:
if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
}
return SDValue();
}
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
/// operation. If true is returned then the operands of ADDSUB operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation, so that the rest of the combiner can see that those
/// lanes are unused.
static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
SDValue &Opnd0, SDValue &Opnd1) {
EVT VT = N->getValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
(!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
(!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
// We require the first shuffle operand to be the FSUB node, and the second to
// be the FADD node.
if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
} else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
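// For example, for v4f32 the mask {0, 5, 2, 7} takes lanes 0 and 2 from the
// FSUB node and lanes 1 and 3 from the FADD node, which is exactly what
// ADDSUBPS computes: subtract in the even lanes, add in the odd lanes.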
if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
8, 25, 10, 27, 12, 29, 14, 31})))
return false;
Opnd0 = LHS;
Opnd1 = RHS;
return true;
}
/// \brief Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
return SDValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
// the ADDSUB idiom has been successfully recognized. There are no known
// X86 targets with 512-bit ADDSUB instructions!
if (VT.is512BitVector())
return SDValue();
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
return SDValue();
EVT VT = N->getValueType(0);
// We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (VT.getVectorElementType() != MVT::i32 &&
VT.getVectorElementType() != MVT::i64 &&
VT.getVectorElementType() != MVT::f32 &&
VT.getVectorElementType() != MVT::f64)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check that both sources are concats with undef.
if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
!N1.getOperand(1).isUndef())
return SDValue();
// Construct the new shuffle mask. Elements from the first source retain their
// index, but elements from the second source no longer need to skip an undef.
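// For example, for a v4i32 shuffle of (concat t1, undef) and (concat t2,
// undef) with v2i32 sources, original index 4 (the first element of t2)
// becomes 4 - 2 = 2, which is the same element in the new concat of t1, t2.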
SmallVector<int, 8> Mask;
int NumElts = VT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (int Elt : SVOp->getMask())
Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
SDLoc DL(N);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
N1.getOperand(0));
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
//
// This code performs the following transformation:
// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
//
// We do this only if both the bitcast and the BINOP dag nodes have
// one use. Also, perform this transformation only if the new binary
// operation is legal. This is to avoid introducing dag nodes that
// potentially need to be further expanded (or custom lowered) into a
// less optimal sequence of dag nodes.
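// For example:
//   (v8i16 shuffle (bitcast (v4i32 add A, B)), undef, <0,2,4,6,u,u,u,u>)
// becomes
//   (v8i16 shuffle (add (bitcast A), (bitcast B)), undef, <0,2,4,6,u,u,u,u>)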
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE &&
N->getOperand(0).getOpcode() == ISD::BITCAST &&
N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue BC0 = N0.getOperand(0);
EVT SVT = BC0.getValueType();
unsigned Opcode = BC0.getOpcode();
unsigned NumElts = VT.getVectorNumElements();
if (BC0.hasOneUse() && SVT.isVector() &&
SVT.getVectorNumElements() * 2 == NumElts &&
TLI.isOperationLegal(Opcode, VT)) {
bool CanFold = false;
switch (Opcode) {
default : break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
// isOperationLegal lies for integer ops on floating point types.
CanFold = VT.isInteger();
break;
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
// isOperationLegal lies for floating point ops on integer types.
CanFold = VT.isFloatingPoint();
break;
}
unsigned SVTNumElts = SVT.getVectorNumElements();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
CanFold = SVOp->getMaskElt(i) < 0;
if (CanFold) {
SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
}
}
}
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
Elts.clear();
break;
}
if (Elts.size() == VT.getVectorNumElements())
if (SDValue LD =
EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
// (concat_vectors t2, undef))
// Into:
// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
// Since the latter can be efficiently lowered with VPERMD/VPERMQ
if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
return ShufConcat;
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
return Shuffle;
// Try recursively combining arbitrary sequences of x86 shuffle
// instructions into higher-order shuffles. We do this after combining
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
return SDValue();
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue InVec = N->getOperand(0);
SDValue EltNo = N->getOperand(1);
EVT EltVT = N->getValueType(0);
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
EVT OriginalVT = InVec.getValueType();
// Peek through bitcasts, don't duplicate a load with other uses.
InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
if (!CurrentVT.isVector() ||
CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
// Don't duplicate a load with other uses.
if (!InVec.hasOneUse())
return SDValue();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
bool UnaryShuffle;
if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against an out-of-range extract index.
unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
if (Idx == SM_SentinelZero)
return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
: DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
if (Idx == SM_SentinelUndef)
return DAG.getUNDEF(EltVT);
assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
: ShuffleOps[1];
// If the shuffle inputs are the same for both operands, then allow 2 uses.
unsigned AllowedUses =
(ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
return SDValue();
AllowedUses = 1; // only allow 1 load use if we have a bitcast
LdNode = LdNode.getOperand(0);
}
if (!ISD::isNormalLoad(LdNode.getNode()))
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
return SDValue();
// If there's a bitcast before the shuffle, check if the load type and
// alignment is valid.
unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
EltVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
return SDValue();
// All checks match, so transform back to vector_shuffle so that the DAG
// combiner can finish the job.
SDLoc dl(N);
// Create the shuffle node, taking into account the case that it's a unary
// shuffle.
SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
ShuffleMask);
Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
const X86Subtarget &Subtarget) {
EVT VT = BitCast.getValueType();
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
switch (VecVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
case MVT::v2i1:
SExtVT = MVT::v2i64;
FPCastVT = MVT::v2f64;
break;
case MVT::v4i1:
SExtVT = MVT::v4i32;
FPCastVT = MVT::v4f32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
if (N0->getOpcode() == ISD::SETCC &&
N0->getOperand(0)->getValueType(0).is256BitVector() &&
Subtarget.hasInt256()) {
SExtVT = MVT::v4i64;
FPCastVT = MVT::v4f64;
}
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
// sign-extend to a 256-bit operation to match the compare.
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
if (N0->getOpcode() == ISD::SETCC &&
N0->getOperand(0)->getValueType(0).is256BitVector() &&
Subtarget.hasInt256()) {
SExtVT = MVT::v8i32;
FPCastVT = MVT::v8f32;
}
break;
case MVT::v16i1:
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
// require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
// TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
if (!Subtarget.hasInt256())
return SDValue();
SExtVT = MVT::v32i8;
break;
}
SDLoc DL(BitCast);
SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
if (SExtVT == MVT::v8i16) {
V = DAG.getBitcast(MVT::v16i8, V);
V = DAG.getVectorShuffle(
MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
{0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
} else
assert(SExtVT.getScalarType() != MVT::i16 &&
"Vectors of i16 must be shuffled");
if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
V = DAG.getBitcast(FPCastVT, V);
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
return DAG.getZExtOrTrunc(V, DL, VT);
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SrcVT = N0.getValueType();
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the setcc result is scalarized on subtargets that don't have legal
// vxi1 types.
if (DCI.isBeforeLegalize())
if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
return V;
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
// Detect bitcasts from i32 to the x86mmx low word.
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType() == MVT::i32)
return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
}
// Detect bitcasts from an element or subvector extraction to x86mmx.
if (VT == MVT::x86mmx &&
(N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType().is128BitVector())
return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
DAG.getBitcast(MVT::v2i64, N00));
}
// Detect bitcasts from FP_TO_SINT to x86mmx.
if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
N0.getOpcode() == ISD::FP_TO_SINT) {
SDLoc DL(N0);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
DAG.getUNDEF(MVT::v2i32));
return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
DAG.getBitcast(MVT::v2i64, Res));
}
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
// constant in an integer register and transferring it to an SSE register or
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
}
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
(Subtarget.hasSSE2() && VT == MVT::f64)))
return SDValue();
SDValue LogicOp0 = N0.getOperand(0);
SDValue LogicOp1 = N0.getOperand(1);
SDLoc DL0(N0);
// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
}
// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
}
return SDValue();
}
// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
// The pattern must end in an extract from index 0.
if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
!isNullConstant(Extract->getOperand(1)))
return SDValue();
unsigned Stages =
Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
SDValue Op = Extract->getOperand(0);
// At each stage, we're looking for something that looks like:
// %s = shufflevector <8 x i32> %op, <8 x i32> undef,
// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
// i32 undef, i32 undef, i32 undef, i32 undef>
// %a = binop <8 x i32> %op, %s
// Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
// we expect something like:
// <4,5,6,7,u,u,u,u>
// <2,3,u,u,u,u,u,u>
// <1,u,u,u,u,u,u,u>
for (unsigned i = 0; i < Stages; ++i) {
if (Op.getOpcode() != BinOp)
return SDValue();
ShuffleVectorSDNode *Shuffle =
dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
if (Shuffle) {
Op = Op.getOperand(1);
} else {
Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
Op = Op.getOperand(0);
}
// The first operand of the shuffle should be the same as the other operand
// of the binop.
if (!Shuffle || (Shuffle->getOperand(0) != Op))
return SDValue();
// Verify the shuffle has the expected (at this stage of the pyramid) mask.
for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
return SDValue();
}
return Op;
}
// Given a select, detect the following pattern:
// 1: %2 = zext <N x i8> %0 to <N x i32>
// 2: %3 = zext <N x i8> %1 to <N x i32>
// 3: %4 = sub nsw <N x i32> %2, %3
// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
SDValue &Op1) {
// Check that the condition of the select instruction is a greater-than (or
// less-than) comparison.
SDValue SetCC = Select->getOperand(0);
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
// The following instructions assume SelectOp1 is the subtraction operand
// and SelectOp2 is the negation operand.
// In the case of SETLT this is the other way around.
if (CC == ISD::SETLT)
std::swap(SelectOp1, SelectOp2);
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
SelectOp2.getOperand(1) == SelectOp1))
return false;
// The first operand of SetCC is the first operand of the select, which is the
// difference between the two input vectors.
if (SetCC.getOperand(0) != SelectOp1)
return false;
// In the SETLT case, the second operand of the comparison can be either 1
// or 0.
APInt SplatVal;
if ((CC == ISD::SETLT) &&
!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
SplatVal == 1) ||
(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
return false;
// In the SETGT case, the second operand of the comparison can be either -1
// or 0.
if ((CC == ISD::SETGT) &&
!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
// The first operand of the select is the difference between the two input
// vectors.
if (SelectOp1.getOpcode() != ISD::SUB)
return false;
Op0 = SelectOp1.getOperand(0);
Op1 = SelectOp1.getOperand(1);
// Check if the operands of the sub are zero-extended from vectors of i8.
if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
Op1.getOpcode() != ISD::ZERO_EXTEND ||
Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
return false;
return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const SDValue &Zext1, const SDLoc &DL) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
unsigned RegSize = std::max(128u, InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
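// For example, a v4i8 input with RegSize == 128 is concatenated with three
// zero v4i8 vectors to form a v16i8 operand; PSADBW on that produces a
// v2i64 vector of partial sums.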
unsigned NumConcat = RegSize / InVT.getSizeInBits();
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
Ops[0] = Zext0.getOperand(0);
MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
Ops[0] = Zext1.getOperand(0);
SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Actually build the SAD
MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2 or with AVX512VL (which uses predicate registers).
if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
unsigned BitWidth = ExtractVT.getSizeInBits();
if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
ExtractVT != MVT::i8)
return SDValue();
// Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
SDValue Match = matchBinOpReduction(Extract, Op);
if (!Match)
continue;
// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
// which we can't support here for now.
if (Match.getScalarValueSizeInBits() != BitWidth)
continue;
// We require AVX2 for PMOVMSKB for v16i16/v32i8.
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 &&
((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
return SDValue();
// Don't bother performing this for 2-element vectors.
if (Match.getValueType().getVectorNumElements() <= 2)
return SDValue();
// Check that we are extracting a reduction of all sign bits.
if (DAG.ComputeNumSignBits(Match) != BitWidth)
return SDValue();
// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
MVT MaskVT;
if (64 == BitWidth || 32 == BitWidth)
MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
MatchSizeInBits / BitWidth);
else
MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
APInt CompareBits;
ISD::CondCode CondCode;
if (Op == ISD::OR) {
// any_of -> MOVMSK != 0
CompareBits = APInt::getNullValue(32);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
CondCode = ISD::CondCode::SETEQ;
}
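// For example, a v4f32 mask yields 4 sign bits from MOVMSKPS, so all_of
// compares the MOVMSK result against 0xF while any_of just tests that it is
// non-zero.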
// Perform the select as i32/i64 and then truncate to avoid partial register
// stalls.
unsigned ResWidth = std::max(BitWidth, 32u);
EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
SDLoc DL(Extract);
SDValue Zero = DAG.getConstant(0, DL, ResVT);
SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
SDValue Res = DAG.getBitcast(MaskVT, Match);
Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
Ones, Zero, CondCode);
return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
}
return SDValue();
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
// Verify the type we're extracting from is any integer type above i16.
EVT VT = Extract->getOperand(0).getValueType();
if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
if (Subtarget.hasBWI())
RegSize = 512;
else if (Subtarget.hasAVX2())
RegSize = 256;
// We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
// The operand is expected to be zero extended from i8
// (verified in detectZextAbsDiff).
// In order to convert to i64 and above, additional any/zero/sign
// extend is expected.
// The zero extend from 32 bit has no mathematical effect on the result.
// Also, the sign extend is effectively a zero extend here (it extends the
// sign bit, which is zero), so it is correct to skip the sign/zero extend
// instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
Root.getOpcode() == ISD::ZERO_EXTEND ||
Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
SDValue Zext0, Zext1;
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
// Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
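// For example, a 16-element reduction (Stages == 4) produces a single v2i64
// PSADBW result, so one shuffle moves element 1 down to element 0 and a
// final add folds the two partial sums together.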
unsigned Stages = Log2_32(VT.getVectorNumElements());
MVT SadVT = SAD.getSimpleValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
for (unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
}
}
MVT Type = Extract->getSimpleValueType(0);
unsigned TypeSizeInBits = Type.getSizeInBits();
// Return the lowest TypeSizeInBits bits.
MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
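// For example, if the source is v16i8 but the shuffle inputs were matched
// with a v4i32 mask, each dword mask element is expanded into four
// consecutive byte indices.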
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
scaleShuffleMask(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
SmallVector<int, 16> WidenedMask;
while (Mask.size() > NumSrcElts &&
canWidenShuffleElements(Mask, WidenedMask))
Mask = std::move(WidenedMask);
// TODO - investigate support for wider shuffle masks with known upper
// undef/zero elements for implicit zero-extension.
}
}
// Check if narrowing/widening failed.
if (Mask.size() != NumSrcElts)
return SDValue();
int SrcIdx = Mask[N->getConstantOperandVal(1)];
SDLoc dl(N);
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
return DAG.getUNDEF(VT);
if (SrcIdx == SM_SentinelZero)
return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
: DAG.getConstant(0, dl, VT);
SDValue SrcOp = Ops[SrcIdx / Mask.size()];
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SrcIdx = SrcIdx % Mask.size();
// We can only extract other elements from 128-bit vectors and in certain
// circumstances, depending on SSE-level.
// TODO: Investigate using extract_subvector for larger vectors.
// TODO: Investigate float/double extraction if it will be just stored.
if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
assert(SrcSVT == VT && "Unexpected extraction type");
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
}
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
DAG.getValueType(SrcSVT));
return DAG.getZExtOrTrunc(Assert, dl, VT);
}
return SDValue();
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
SDValue InputVector = N->getOperand(0);
SDValue EltIdx = N->getOperand(1);
EVT SrcVT = InputVector.getValueType();
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);
}
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
uint64_t ExtractedElt = N->getConstantOperandVal(1);
uint64_t InputValue = InputVector.getConstantOperandVal(0);
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization.
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
// single use which is a sign-extend or zero-extend, and all elements are
// used.
SmallVector<SDNode *, 4> Uses;
unsigned ExtractedElements = 0;
for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
if (UI.getUse().getResNo() != InputVector.getResNo())
return SDValue();
SDNode *Extract = *UI;
if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
if (Extract->getValueType(0) != MVT::i32)
return SDValue();
if (!Extract->hasOneUse())
return SDValue();
if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
if (!isa<ConstantSDNode>(Extract->getOperand(1)))
return SDValue();
// Record which element was extracted.
ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
// If not all the elements were used, this may not be worthwhile.
if (ExtractedElements != 15)
return SDValue();
// Ok, we've now decided to do the transformation.
// If 64-bit shifts are legal, use the extract-shift sequence,
// otherwise bounce the vector off the cache.
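// With legal 64-bit shifts, the v4i32 vector is read as two i64 halves:
// elements 0 and 1 come from the low i64 (truncate, then shift right by 32
// and truncate), and elements 2 and 3 likewise come from the high i64.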
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vals[4];
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
auto &DL = DAG.getDataLayout();
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(0, dl, VecIdxTy));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(1, dl, VecIdxTy));
SDValue ShAmt = DAG.getConstant(
32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
for (unsigned i = 0; i < 4; ++i) {
uint64_t Offset = EltSize * i;
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
SDValue ScalarAddr =
DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
// Load the scalar.
Vals[i] =
DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
}
}
// Replace the extracts
for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
// The replacement was made in place; don't return anything.
return SDValue();
}
// TODO - merge with combineExtractVectorElt once it can handle the implicit
// zero-extension of X86ISD::PEXTRW/X86ISD::PEXTRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() != ISD::VSELECT)
return SDValue();
assert(CondVT.isVector() && "Vector select expects a vector selector!");
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
DAG.getAllOnesConstant(DL, CondVT));
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. I.e., the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
// Don't check if the types themselves are equal because that excludes
// vector floating-point selects.
if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// Try to invert the condition if true value is not all 1s and false value is
// not all 0s.
if (!TValIsAllOnes && !FValIsAllZeros &&
// Check if the selector will be produced by CMPP*/PCMP*.
Cond.getOpcode() == ISD::SETCC &&
// Check if SETCC has already been promoted.
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
CondVT) {
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
ISD::CondCode NewCC =
ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
Cond.getOperand(0).getValueType().isInteger());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
TValIsAllOnes = FValIsAllOnes;
FValIsAllZeros = TValIsAllZeros;
}
}
// vselect Cond, 111..., 000... -> Cond
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
if (TValIsAllOnes) {
SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
return DAG.getBitcast(VT, Or);
}
// vselect Cond, X, 000... -> and Cond, X
if (FValIsAllZeros) {
SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
return DAG.getBitcast(VT, And);
}
return SDValue();
}
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
SDValue Cond = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
SDLoc DL(N);
auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
if (!TrueC || !FalseC)
return SDValue();
// Don't do this for crazy integer types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
return SDValue();
// If this is efficiently invertible, canonicalize the LHSC/RHSC values
// so that TrueC (the true value) is larger than FalseC.
bool NeedsCondInvert = false;
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
// Efficiently invertible.
(Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
(Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
isa<ConstantSDNode>(Cond.getOperand(1))))) {
NeedsCondInvert = true;
std::swap(TrueC, FalseC);
}
// Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, DL, Cond.getValueType()));
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
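// For example, (C ? 13 : 4) has Diff == 9: the condition is zero-extended,
// scaled by 9 and added to the base 4, matching the lea base(cond, cond*8)
// form below.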
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
IsFastMultiplier = true;
break;
}
}
if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
DAG.getConstant(1, DL, Cond.getValueType()));
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
}
}
return SDValue();
}
// If this is a bitcasted op that can be represented as another type, push
// the bitcast to the inputs. This allows more opportunities for pattern
// matching masked instructions. This is called when we know that the operation
// is used as one of the inputs of a vselect.
static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// Make sure we have a bitcast.
if (OrigOp.getOpcode() != ISD::BITCAST)
return false;
SDValue Op = OrigOp.getOperand(0);
// If the operation is used by anything other than the bitcast, we shouldn't
// do this combine as that would replicate the operation.
if (!Op.hasOneUse())
return false;
MVT VT = OrigOp.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
SDLoc DL(Op.getNode());
auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
SDValue Op2) {
Op0 = DAG.getBitcast(VT, Op0);
DCI.AddToWorklist(Op0.getNode());
Op1 = DAG.getBitcast(VT, Op1);
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
return true;
};
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::PALIGNR:
// PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
if (!VT.is128BitVector())
return false;
Opcode = X86ISD::VALIGN;
LLVM_FALLTHROUGH;
case X86ISD::VALIGN: {
if (EltVT != MVT::i32 && EltVT != MVT::i64)
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
// Make sure we can represent the same shift with the new VT.
if ((ShiftAmt % EltSize) != 0)
return false;
Imm = ShiftAmt / EltSize;
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
DAG.getConstant(Imm, DL, MVT::i8));
}
case X86ISD::SHUF128: {
if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
return false;
// Only change element size, not type.
if (VT.isInteger() != Op.getSimpleValueType().isInteger())
return false;
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
case ISD::INSERT_SUBVECTOR: {
unsigned EltSize = EltVT.getSizeInBits();
if (EltSize != 32 && EltSize != 64)
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
DCI.AddToWorklist(Op0.getNode());
// Op1 needs to be bitcasted to a smaller vector with the same element type.
SDValue Op1 = Op.getOperand(1);
MVT Op1VT = MVT::getVectorVT(EltVT,
Op1.getSimpleValueType().getSizeInBits() / EltSize);
Op1 = DAG.getBitcast(Op1VT, Op1);
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1,
DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
unsigned EltSize = EltVT.getSizeInBits();
if (EltSize != 32 && EltSize != 64)
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
// Op0 needs to be bitcasted to a larger vector with the same element type.
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = MVT::getVectorVT(EltVT,
Op0.getSimpleValueType().getSizeInBits() / EltSize);
Op0 = DAG.getBitcast(Op0VT, Op0);
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0,
DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case X86ISD::SUBV_BROADCAST: {
unsigned EltSize = EltVT.getSizeInBits();
if (EltSize != 32 && EltSize != 64)
return false;
// Only change element size, not type.
if (VT.isInteger() != Op.getSimpleValueType().isInteger())
return false;
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = MVT::getVectorVT(EltVT,
Op0.getSimpleValueType().getSizeInBits() / EltSize);
Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0));
return true;
}
}
return false;
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
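// e.g. (select (setcc x, y, setolt), x, y) maps to (fmin x, y); some other
// condition codes first need NaN / signed-zero checks or an operand swap.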
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
switch (CC) {
default: break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETULE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGT:
// Converting this to a max would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETUGE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMAX;
break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
DAG.isEqualTo(RHS, Cond.getOperand(0))) {
switch (CC) {
default: break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
(!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
break;
Opcode = X86ISD::FMIN;
break;
case ISD::SETUGE:
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
Opcode = X86ISD::FMIN;
break;
case ISD::SETULT:
// Converting this to a max would handle NaNs incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMAX;
break;
case ISD::SETOLE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
if (!DAG.getTarget().Options.UnsafeFPMath &&
!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
std::swap(LHS, RHS);
}
Opcode = X86ISD::FMAX;
break;
case ISD::SETULE:
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
Opcode = X86ISD::FMAX;
break;
}
}
if (Opcode)
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
// The same applies to all 128- and 256-bit vectors of i8 and i16.
// Since SKX, these selects have a proper lowering.
if (Subtarget.hasAVX512() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
(VT.is128BitVector() || VT.is256BitVector()) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16) &&
!(Subtarget.hasBWI() && Subtarget.hasVLX())) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
DCI.AddToWorklist(Cond.getNode());
return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
}
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
// Canonicalize max and min:
// (x > y) ? x : y -> (x >= y) ? x : y
// (x < y) ? x : y -> (x <= y) ? x : y
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
// the need for an extra compare against zero. e.g.
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
// cmovgl %edi, %eax
// =>
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
switch (CC) {
default: break;
case ISD::SETLT:
case ISD::SETGT: {
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
// Match VSELECTs into subs with unsigned saturation.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// Check if one of the arms of the VSELECT is a zero vector. If it's on the
// left side invert the predicate to simplify logic below.
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
CC = ISD::getSetCCInverse(CC, true);
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
if (Other.getNode() && Other->getNumOperands() == 2 &&
DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
SDValue CondRHS = Cond->getOperand(1);
// Look for a general sub with unsigned saturation first.
// x >= y ? x-y : 0 --> subus x, y
// x > y ? x-y : 0 --> subus x, y
if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
// If the RHS is a constant we have to reverse the const
// canonicalization.
// x > C-1 ? x+-C : 0 --> subus x, C
if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
CondRHSConst->getAPIntValue() ==
(-OpRHSConst->getAPIntValue() - 1))
return DAG.getNode(
X86ISD::SUBUS, DL, VT, OpLHS,
DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
// FIXME: Would it be better to use computeKnownBits to determine
// whether it's safe to decanonicalize the xor?
// x s< 0 ? x^C : 0 --> subus x, C
if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
OpRHSConst->getAPIntValue().isSignMask())
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
return DAG.getNode(
X86ISD::SUBUS, DL, VT, OpLHS,
DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
}
}
}
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
return V;
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
// condition so that blends can use the high (sign) bit of each element and
// use SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getScalarValueSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
return SDValue();
// We can only handle the cases where VSELECT is directly legal on the
// subtarget. We custom lower VSELECT nodes with constant conditions and
// this makes it hard to see whether a dynamic VSELECT will correctly
// lower, so we both check the operation's status and explicitly handle the
// cases where a *dynamic* blend will fail even though a constant-condition
// blend could be custom lowered.
// FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowered.
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
return SDValue();
// FIXME: We don't support i16-element blends currently. We could and
// should support them by making *all* the bits in the condition be set
// rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
// If we changed the computation somewhere in the DAG, this change will
// affect all users of Cond. Make sure it is fine and update all the nodes
// so that we do not use the generic VSELECT anymore. Otherwise, we may
// perform wrong optimizations as we messed with the actual expectation
// for the vector boolean values.
if (Cond != TLO.Old) {
// Check all uses of the condition operand to check whether it will be
// consumed by non-BLEND instructions. Those may require that all bits
// are set properly.
for (SDNode *U : Cond->uses()) {
// TODO: Add other opcodes eventually lowered into BLEND.
if (U->getOpcode() != ISD::VSELECT)
return SDValue();
}
// Update all users of the condition before committing the change, so
// that the VSELECT optimizations that expect the correct vector boolean
// value will not be triggered.
for (SDNode *U : Cond->uses()) {
SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
U->getValueType(0), Cond, U->getOperand(1),
U->getOperand(2));
DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
}
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
// Only Cond (rather than other nodes in the computation chain) was
// changed. Change the condition just for N to keep the opportunity to
// optimize all other users their own way.
SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
return SDValue();
}
}
// Look for vselects with LHS/RHS being bitcasted from an operation that
// can be executed on another type. Push the bitcast to the inputs of
// the operation. This exposes opportunities for using masking instructions.
if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
CondVT.getVectorElementType() == MVT::i1) {
if (combineBitcastForMaskedOp(LHS, DAG, DCI))
return SDValue(N, 0);
if (combineBitcastForMaskedOp(RHS, DAG, DCI))
return SDValue(N, 0);
}
+ // Custom combine for SELECT of MMX values: do the select as i64 and bitcast
+ // the result back to x86mmx.
+ if (VT == MVT::x86mmx) {
+ LHS = DAG.getBitcast(MVT::i64, LHS);
+ RHS = DAG.getBitcast(MVT::i64, RHS);
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, NewSelect);
+ }
+
return SDValue();
}
/// Combine:
/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
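/// For example, (setcc (cmp (atomic_load_add x, 1), 0), COND_S) becomes
/// (setcc (LADD x, 1), COND_LE), so no separate CMP of the loaded value is
/// needed.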
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
SelectionDAG &DAG) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Can't replace the cmp if it has more uses than the one we're looking at.
// FIXME: We would like to be able to handle this, but would need to make sure
// all uses were updated.
if (!Cmp.hasOneUse())
return SDValue();
// This only applies to variations of the common case:
// (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
// (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
// (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
// (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
// Using the proper condcodes (see below), overflow is checked for.
// FIXME: We can generalize both constraints:
// - XOR/OR/AND (if they were made to survive AtomicExpand)
// - LHS != 1
// if the result is compared.
SDValue CmpLHS = Cmp.getOperand(0);
SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
return SDValue();
auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
return SDValue();
const unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
return SDValue();
SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
return SDValue();
APInt Addend = OpRHSC->getAPIntValue();
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
CC = X86::COND_GE;
else if (CC == X86::COND_LE && Addend == -1)
CC = X86::COND_L;
else
return SDValue();
SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
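// e.g. (brcond (cmp (setcc Cond, EFLAGS), 0), COND_NE) simplifies to
// (brcond EFLAGS, Cond).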
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
return SDValue();
// Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
// Check CMP operands. One of them should be 0 or 1 and the other should be
// an SetCC or extended from it.
SDValue Op1 = Cmp.getOperand(0);
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
if ((C = dyn_cast<ConstantSDNode>(Op1)))
SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
SetCC = Op1;
else // Quit if neither operand is a constant.
return SDValue();
if (C->getZExtValue() == 1) {
needOppositeCond = !needOppositeCond;
checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
// Quit if the constant is neither 0 nor 1.
return SDValue();
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
SetCC.getOpcode() == ISD::TRUNCATE ||
SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
OpIdx = 1;
if (isOneConstant(SetCC.getOperand(1)))
OpIdx = 0;
if (OpIdx < 0)
break;
SetCC = SetCC.getOperand(OpIdx);
truncatedToBoolWithAnd = true;
} else
SetCC = SetCC.getOperand(0);
}
switch (SetCC.getOpcode()) {
case X86ISD::SETCC_CARRY:
// Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
// simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
// i.e. it's a comparison against true but the result of SETCC_CARRY is not
// truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
"Invalid use of SETCC_CARRY!");
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
// Set the condition code or opposite one if necessary.
CC = X86::CondCode(SetCC.getConstantOperandVal(0));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(1);
case X86ISD::CMOV: {
// Check whether the false/true values are canonical, i.e. 0 or 1.
ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
if (Op.getOpcode() == ISD::ZERO_EXTEND ||
Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if the false condition
// is found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
bool FValIsFalse = true;
if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
}
// Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
return SDValue();
CC = X86::CondCode(SetCC.getConstantOperandVal(2));
if (needOppositeCond)
CC = X86::GetOppositeBranchCondition(CC);
return SetCC.getOperand(3);
}
}
return SDValue();
}
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
/// (X86or (X86setcc) (X86setcc))
/// (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
X86::CondCode &CC1, SDValue &Flags,
bool &isAnd) {
if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
return false;
Cond = Cond->getOperand(0);
}
isAnd = false;
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
isAnd = true;
LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
SetCC0 = Cond->getOperand(0);
SetCC1 = Cond->getOperand(1);
break;
};
// Make sure we have SETCC nodes, using the same flags value.
if (SetCC0.getOpcode() != X86ISD::SETCC ||
SetCC1.getOpcode() != X86ISD::SETCC ||
SetCC0->getOperand(1) != SetCC1->getOperand(1))
return false;
CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
}
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG) {
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
// If the flag operand isn't dead, don't touch this CMOV.
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
return SDValue();
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
if (CC == X86::COND_E || CC == X86::COND_NE) {
switch (Cond.getOpcode()) {
default: break;
case X86ISD::BSR:
case X86ISD::BSF:
// If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
if (DAG.isKnownNeverZero(Cond.getOperand(0)))
return (CC == X86::COND_E) ? FalseOp : TrueOp;
}
}
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
}
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
// larger than FalseC (the false value).
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
// This is efficient for any integer data type (including i8/i16) and
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
if (N->getNumValues() == 2) // Dead flag value?
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
if (N->getNumValues() == 2) // Dead flag value?
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
bool isFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
case 3: // result = lea base(cond, cond*2)
case 4: // result = lea base( , cond*4)
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
if (N->getNumValues() == 2) // Dead flag value?
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
}
}
}
// Handle these cases:
// (select (x != c), e, c) -> (select (x != c), e, x),
// (select (x == c), c, e) -> (select (x == c), x, e)
// where the c is an integer constant, and the "select" is the combination
// of CMOV and CMP.
//
// The rationale for this change is that the conditional-move from a constant
// needs two instructions, however, conditional-move from a register needs
// only one instruction.
//
// CAVEAT: By replacing a constant with a symbolic value, it may obscure
// some instruction-combining opportunities. This opt needs to be
// postponed as late as possible.
//
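// For example, (cmov e, 42, COND_E, (cmp x, 42)) becomes
// (cmov e, x, COND_E, (cmp x, 42)); when COND_E holds, x already equals 42,
// so the constant does not need to be materialized.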
if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
// the DCI.xxxx conditions are provided to postpone the optimization as
// late as possible.
ConstantSDNode *CmpAgainst = nullptr;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
!isa<ConstantSDNode>(Cond.getOperand(0))) {
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueOp, FalseOp);
}
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
}
}
// Fold and/or of setcc's to double CMOV:
// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
//
// This combine lets us generate:
// cmovcc1 (jcc1 if we don't have CMOV)
// cmovcc2 (same)
// instead of:
// setcc1
// setcc2
// and/or
// cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
// When we can use CMOV, or when there is no mispredict, this improves
// throughput and reduces register pressure.
//
if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
CC0 = X86::GetOppositeBranchCondition(CC0);
CC1 = X86::GetOppositeBranchCondition(CC1);
}
SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
return CMOV;
}
}
return SDValue();
}
/// Different mul shrinking modes.
enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
return false;
assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
unsigned SignBits[2] = {1, 1};
bool IsPositive[2] = {false, false};
for (unsigned i = 0; i < 2; i++) {
SDValue Opd = N->getOperand(i);
// DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
// compute signbits for it separately.
if (Opd.getOpcode() == ISD::ANY_EXTEND) {
// For anyextend, it is safe to assume an appropriate number of leading
// sign/zero bits.
if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
SignBits[i] = 25;
else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
MVT::i16)
SignBits[i] = 17;
else
return false;
IsPositive[i] = true;
} else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
// All the operands of BUILD_VECTOR need to be int constant.
// Find the smallest value range which all the operands belong to.
SignBits[i] = 32;
IsPositive[i] = true;
for (const SDValue &SubOp : Opd.getNode()->op_values()) {
if (SubOp.isUndef())
continue;
auto *CN = dyn_cast<ConstantSDNode>(SubOp);
if (!CN)
return false;
APInt IntVal = CN->getAPIntValue();
if (IntVal.isNegative())
IsPositive[i] = false;
SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
}
} else {
SignBits[i] = DAG.ComputeNumSignBits(Opd);
if (Opd.getOpcode() == ISD::ZERO_EXTEND)
IsPositive[i] = true;
}
}
bool AllPositive = IsPositive[0] && IsPositive[1];
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
Mode = MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
Mode = MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
Mode = MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
Mode = MULU16;
else
return false;
return true;
}
/// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of the mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
/// %2 = sext/zext <N x i8> %1 to <N x i32>
/// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// Pattern2:
/// %2 = zext/sext <N x i16> %1 to <N x i32>
/// %4 = zext/sext <N x i16> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
/// %5 = mul <N x i32> %2, %4
///
/// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
/// generate pmullw+sext32 for it (MULS8 mode).
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
/// generate pmullw+zext32 for it (MULU8 mode).
/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
/// generate pmullw+pmulhw for it (MULS16 mode).
/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
/// generate pmullw+pmulhuw for it (MULU16 mode).
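/// For example, (mul (zext <8 x i8> %a to <8 x i32>), (zext <8 x i8> %b to
/// <8 x i32>)) is MULU8: both operands are truncated to <8 x i16>, multiplied
/// with pmullw, and the result is zero-extended back to <8 x i32>.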
static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Check for legality.
// pmullw/pmulhw require SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
// Check for profitability
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
if (Mode == MULU8 || Mode == MULS8) {
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
DL, VT, MulLo);
} else {
MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
ShuffleMask[2 * i] = i;
ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
} else {
// When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
// to legalize the mul explicitly because implicit legalization for type
// <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
// instructions which will not exist when we explicitly legalize it by
// extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
// <4 x i16> undef).
//
// Legalize the operands of mul.
// FIXME: We may be able to handle non-concatenated vectors by insertion.
unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
if ((RegSize % ReducedSizeInBits) != 0)
return SDValue();
SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
Ops[0] = NewN1;
NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
if (Mode == MULU8 || Mode == MULS8) {
// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed.
SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
// convert the type of mul result to VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
: ISD::SIGN_EXTEND_VECTOR_INREG,
DL, ResVT, Mul);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
} else {
// Generate the lower and higher part of mul: pmulhw/pmulhuw. For
// MULU16/MULS16, both parts are needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
OpsVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result. Make sure the type of mul result is VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
}
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
EVT VT, SDLoc DL) {
auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(Mult, DL, VT));
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
DAG.getConstant(Shift, DL, MVT::i8));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
auto combineMulMulAddOrSub = [&](bool isAdd) {
SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(9, DL, VT));
Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
N->getOperand(0));
return Result;
};
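// e.g. combineMulShlAddOrSub(5, 2, /*isAdd*/ true) computes ((x * 5) << 2) + x,
// i.e. x * 21, and combineMulMulAddOrSub(/*isAdd*/ false) computes
// ((x * 9) * 3) - x, i.e. x * 26.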
switch (MulAmt) {
default:
break;
case 11:
// mul x, 11 => add ((shl (mul x, 5), 1), x)
return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
case 21:
// mul x, 21 => add ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
case 22:
// mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
case 19:
// mul x, 19 => sub ((shl (mul x, 5), 2), x)
return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
case 13:
// mul x, 13 => add ((shl (mul x, 3), 2), x)
return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
case 23:
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
case 14:
// mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
case 26:
// mul x, 26 => sub ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(/*isAdd*/ false);
case 28:
// mul x, 28 => add ((mul (mul x, 9), 3), x)
return combineMulMulAddOrSub(/*isAdd*/ true);
case 29:
// mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
combineMulMulAddOrSub(/*isAdd*/ true));
case 30:
// mul x, 30 => sub (sub ((shl x, 5), x), x)
return DAG.getNode(
ISD::SUB, DL, VT,
DAG.getNode(ISD::SUB, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(5, DL, MVT::i8)),
N->getOperand(0)),
N->getOperand(0));
}
return SDValue();
}
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
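/// For example, mul x, 45 becomes (mul (mul x, 9), 5), the LEA + LEA case.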
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
return SDValue();
uint64_t MulAmt = C->getZExtValue();
if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
return SDValue();
uint64_t MulAmt1 = 0;
uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
MulAmt1 = 9;
MulAmt2 = MulAmt / 9;
} else if ((MulAmt % 5) == 0) {
MulAmt1 = 5;
MulAmt2 = MulAmt / 5;
} else if ((MulAmt % 3) == 0) {
MulAmt1 = 3;
MulAmt2 = MulAmt / 3;
}
SDLoc DL(N);
SDValue NewMul;
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
if (isPowerOf2_64(MulAmt2) &&
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is pow2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add.
std::swap(MulAmt1, MulAmt2);
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
DAG.getConstant(MulAmt1, DL, VT));
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
} else if (!Subtarget.slowLEA())
NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
if (!NewMul) {
assert(MulAmt != 0 &&
MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
int64_t SignMulAmt = C->getSExtValue();
if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
(SignMulAmt != -INT64_MAX)) {
int NumSign = SignMulAmt > 0 ? 1 : -1;
bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
if (IsPowerOf2_64PlusOne) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
MVT::i8)));
} else if (IsPowerOf2_64MinusOne) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(
ISD::SUB, DL, VT,
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
MVT::i8)),
N->getOperand(0));
}
// To negate, subtract the number from zero
if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
NewMul =
DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
}
}
if (NewMul)
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, NewMul, false);
return SDValue();
}
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example:
// zext(setcc_c) -> i32 0x0000FFFF
// c1 -> i32 0x0000FFFF
// c2 -> i32 0x00000001
// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
} else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
N00.getOpcode() == ISD::ANY_EXTEND) &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
}
if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
}
}
// Hardware support for vector shifts is sparse, which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// shl.
// (shl V, 1) -> add V,V
if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
assert(N0.getValueType().isVector() && "Invalid vector shift type");
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
if (N1SplatC->getAPIntValue() == 1)
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
return SDValue();
}
static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
// fold (ashr (shl a, [56,48,32,24,16]), SarConst)
// into (shl (sext a), [56,48,32,24,16] - SarConst) or
// into (sra (sext a), SarConst - [56,48,32,24,16])
// depending on the sign of (SarConst - [56,48,32,24,16]).
// sexts in X86 are MOVs. The MOVs have the same code size
// as the above SHIFTs (only a SHIFT by 1 has lower code size).
// However the MOVs have 2 advantages over a SHIFT:
// 1. MOVs can write to a register that differs from the source.
// 2. MOVs accept memory operands.
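// e.g. on i64, (sra (shl x, 56), 60) becomes (sra (sext_in_reg x, i8), 4):
// the sign_extend_inreg replaces the shl, and the remaining shift amount is
// SarConst - 56 = 4.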
if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
EVT CVT = N1.getValueType();
if (SarConst.isNegative())
return SDValue();
for (MVT SVT : MVT::integer_valuetypes()) {
unsigned ShiftSize = SVT.getSizeInBits();
// Skip types without a corresponding sext/zext and ShlConst values that
// are not one of [56,48,32,24,16].
if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
continue;
SDLoc DL(N);
SDValue NN =
DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
return NN;
else if (SarConst.isNegative())
return DAG.getNode(ISD::SHL, DL, VT, NN,
DAG.getConstant(-SarConst, DL, CVT));
else
return DAG.getNode(ISD::SRA, DL, VT, NN,
DAG.getConstant(SarConst, DL, CVT));
}
return SDValue();
}
/// \brief Returns a vector of 0s if the input node is a vector logical
/// shift by a constant amount that is known to be greater than or equal
/// to the vector element size in bits.
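/// e.g. a v4i32 logical shift by a constant splat of 32 is folded to an
/// all-zeros vector.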
static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
(!Subtarget.hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
SDValue Amt = N->getOperand(1);
SDLoc DL(N);
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
const APInt &ShiftAmt = AmtSplat->getAPIntValue();
unsigned MaxAmount =
VT.getSimpleVT().getScalarSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
// if the shift amount is bigger than or equal to
// the element size. The constant shift amount will be
// encoded as an 8-bit immediate.
if (ShiftAmt.trunc(8).uge(MaxAmount))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
}
return SDValue();
}
static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (N->getOpcode() == ISD::SHL)
if (SDValue V = combineShiftLeft(N, DAG))
return V;
if (N->getOpcode() == ISD::SRA)
if (SDValue V = combineShiftRightAlgebraic(N, DAG))
return V;
// Try to fold this logical shift into a zero vector.
if (N->getOpcode() != ISD::SRA)
if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
return V;
return SDValue();
}
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
X86ISD::VSRLI == Opcode) &&
"Unexpected shift opcode");
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
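// e.g. a VSRAI of v8i16 by 20 is clamped to a shift by 15, which just
// broadcasts each element's sign bit; a VSRLI by 20 becomes zero.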
APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
if (LogicalShift)
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
else
ShiftVal = NumBitsPerElt - 1;
}
// Shift N0 by zero -> N0.
if (!ShiftVal)
return N0;
// Shift zero -> zero.
if (ISD::isBuildVectorAllZeros(N0.getNode()))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
// fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
// This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
// TODO - support other sra opcodes as needed.
if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
N0.getOpcode() == X86ISD::VSRAI)
return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
// Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
if (N->isOnlyUserOf(N0.getNode()) &&
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
unsigned ShiftImm = ShiftVal.getZExtValue();
for (APInt &Elt : EltBits) {
if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftImm;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftImm);
else
Elt.lshrInPlace(ShiftImm);
}
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
return SDValue();
}
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(
((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
(N->getOpcode() == X86ISD::PINSRW &&
N->getValueType(0) == MVT::v8i16)) &&
"Unexpected vector insertion");
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget);
return SDValue();
}
/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
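/// For example, when the result is not used as a flag, the pair
/// (setcc COND_E, (cmp a, b)) & (setcc COND_NP, (cmp a, b)) on f32/f64 is
/// rewritten into a single CMPEQSS-style compare whose low bit is extracted.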
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
SDValue CMP1 = N1->getOperand(1);
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
case ISD::BR_CC:
case ISD::BRCOND:
case ISD::SELECT:
ExpectingFlags = true;
break;
case ISD::CopyToReg:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
break;
}
if (!ExpectingFlags) {
enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
}
if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
FSetCC, DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, DL,
MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
if (is64BitFP && !Subtarget.is64Bit()) {
// On a 32-bit target, we cannot bitcast the 64-bit float to a
// 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
// bits, but can do this little dance to extract the lowest 32 bits
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
ANDed);
return OneBitOfTruth;
}
}
}
}
return SDValue();
}
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
if (N0.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
if (N1.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
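// For example, (zero_extend (and (trunc X), (trunc Y))) becomes
// (and (and X, Y), low-bits-mask), so the logic stays in the wide register.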
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
return SDValue();
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow->getValueType(0);
if (!NarrowVT.is128BitVector())
return SDValue();
if (Narrow->getOpcode() != ISD::XOR &&
Narrow->getOpcode() != ISD::AND &&
Narrow->getOpcode() != ISD::OR)
return SDValue();
SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
SDLoc DL(Narrow);
// The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
return SDValue();
// The type of the truncated inputs.
EVT WideVT = N0->getOperand(0)->getValueType(0);
if (WideVT != VT)
return SDValue();
// The right side has to be a 'trunc' or a constant vector.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
ConstantSDNode *RHSConstSplat = nullptr;
if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
RHSConstSplat = RHSBV->getConstantSplatNode();
if (!RHSTrunc && !RHSConstSplat)
return SDValue();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
return SDValue();
// Set N0 and N1 to hold the inputs to the new wide operation.
N0 = N0->getOperand(0);
if (RHSConstSplat) {
N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
SDValue(RHSConstSplat, 0));
N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
} else if (RHSTrunc) {
N1 = N1->getOperand(0);
}
// Generate the wide operation.
SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND: {
unsigned InBits = NarrowVT.getScalarSizeInBits();
APInt Mask = APInt::getAllOnesValue(InBits);
Mask = Mask.zext(VT.getScalarSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
Op, DAG.getConstant(Mask, DL, VT));
}
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
default:
llvm_unreachable("Unexpected opcode");
}
}
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned FPOpcode = ISD::DELETED_NODE;
if (N->getOpcode() == ISD::AND)
FPOpcode = X86ISD::FAND;
else if (N->getOpcode() == ISD::OR)
FPOpcode = X86ISD::FOR;
else if (N->getOpcode() == ISD::XOR)
FPOpcode = X86ISD::FXOR;
assert(FPOpcode != ISD::DELETED_NODE &&
"Unexpected input node for FP logic conversion");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
((Subtarget.hasSSE1() && VT == MVT::i32) ||
(Subtarget.hasSSE2() && VT == MVT::i64))) {
SDValue N00 = N0.getOperand(0);
SDValue N10 = N1.getOperand(0);
EVT N00Type = N00.getValueType();
EVT N10Type = N10.getValueType();
if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
}
return SDValue();
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
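/// For example, when each element of the other operand is known to be zero or
/// all-ones (a vector SETCC result), (and X, 1) becomes (srl X, EltBits - 1),
/// avoiding a constant-pool load for the mask.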
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
EVT VT0 = Op0.getValueType();
EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
return SDValue();
unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
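// e.g. (and (srl X, 4), 255) extracts an 8-bit field starting at bit 4, so it
// becomes BEXTR with a control value of 4 | (8 << 8): the start index lives in
// bits 7:0 and the field length in bits 15:8 of the control operand.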
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
return SDValue();
if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
return SDValue();
ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (MaskNode && ShiftNode) {
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), DL,
VT));
}
}
return SDValue();
}
// Try to fold:
// (or (and m, y), (pandn m, x))
// into:
// (vselect m, x, y)
// As a special case, try to fold:
// (or (and m, (sub 0, x)), (pandn m, x))
// into:
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
// TODO: Attempt to match against AND(XOR(-1,X),Y) as well; waiting for the
// ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
SDValue Y;
if (N0.getOperand(0) == Mask)
Y = N0.getOperand(1);
if (N0.getOperand(1) == Mask)
Y = N0.getOperand(0);
// Check to see if the mask appeared in both the AND and ANDNP.
if (!Y.getNode())
return SDValue();
// Validate that X, Y, and Mask are bitcasts, and see through them.
Mask = peekThroughBitcasts(Mask);
X = peekThroughBitcasts(X);
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
unsigned EltBits = MaskVT.getScalarSizeInBits();
// TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
// Try to match:
// (or (and M, (sub 0, X)), (pandn M, X))
// which is a special case of vselect:
// (vselect M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
// ( M ? -X : X) == ((X ^ M ) + (M & 1))
// This lets us transform our vselect to:
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
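// For example, with M == all-ones: (xor X, M) == ~X and subtracting M (-1)
// adds one, yielding -X; with M == 0 both the xor and the sub leave X
// unchanged.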
if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
};
SDValue V;
if (IsNegV(Y.getNode(), X))
V = X;
else if (IsNegV(X.getNode(), Y))
V = Y;
if (V) {
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
// If the negate was on the false side of the select, then
// the operands of the SUB need to be swapped. PR 27251.
// This is because the pattern being matched above is
// (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
// but if the pattern matched was
// (vselect M, X, (sub 0, X)), that is really a negation of the pattern
// above, -(vselect M, (sub 0, X), X), and therefore the replacement
// pattern also needs to be a negation of the replacement pattern above.
// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
// sub accomplishes the negation of the replacement pattern.
if (V == Y)
std::swap(SubOp1, SubOp2);
SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
return DAG.getBitcast(VT, Res);
}
}
// PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
return SDValue();
MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
// seteq(cmp x, 0)
// into:
// srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
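// For a 32-bit value, ctlz(x) == 32 iff x == 0, and 32 is the only possible
// result with bit 5 set, so (ctlz(x) >> 5) is exactly the (x == 0) flag.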
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
SelectionDAG &DAG) {
SDValue Cmp = Op.getOperand(1);
EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
// The result of the shift is true or false, and on X86, the 32-bit
// encoding of shr and lzcnt is more desirable.
SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
DAG.getConstant(Log2b, dl, VT));
return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}
// Try to transform:
// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
// Will also attempt to match more generic cases, eg:
// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
return SDValue();
auto isORCandidate = [](SDValue N) {
return (N->getOpcode() == ISD::OR && N->hasOneUse());
};
// Check that the zero extend is extending to 32 bits or more. The code
// generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
// require extra instructions to clear the upper bits.
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
// Check the node matches: setcc(eq, cmp 0)
auto isSetCCCandidate = [](SDValue N) {
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
SDNode *OR = N->getOperand(0).getNode();
SDValue LHS = OR->getOperand(0);
SDValue RHS = OR->getOperand(1);
// Save nodes matching or(or, setcc(eq, cmp 0)).
SmallVector<SDNode *, 2> ORNodes;
while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
ORNodes.push_back(OR);
OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
}
// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
// We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
// to or(srl(ctlz), srl(ctlz)).
// The dag combiner can then fold it into:
// srl(or(ctlz, ctlz)).
EVT VT = OR->getValueType(0);
SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
SDValue Ret, NewRHS;
if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
if (!Ret)
return SDValue();
// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
while (ORNodes.size() > 0) {
OR = ORNodes.pop_back_val();
LHS = OR->getOperand(0);
RHS = OR->getOperand(1);
// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
if (RHS->getOpcode() == ISD::OR)
std::swap(LHS, RHS);
EVT VT = OR->getValueType(0);
SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
return SDValue();
Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
}
if (Ret)
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
}
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
// series of shifts/or that would otherwise be generated.
// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
// have higher latencies and we are not optimizing for size.
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != MVT::i8)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != MVT::i8)
return SDValue();
if (ShAmt0.getOpcode() == ISD::TRUNCATE)
ShAmt0 = ShAmt0.getOperand(0);
if (ShAmt1.getOpcode() == ISD::TRUNCATE)
ShAmt1 = ShAmt1.getOperand(0);
SDLoc DL(N);
unsigned Opc = X86ISD::SHLD;
SDValue Op0 = N0.getOperand(0);
SDValue Op1 = N1.getOperand(0);
if (ShAmt0.getOpcode() == ISD::SUB ||
ShAmt0.getOpcode() == ISD::XOR) {
Opc = X86ISD::SHRD;
std::swap(Op0, Op1);
std::swap(ShAmt0, ShAmt1);
}
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
unsigned Bits = VT.getSizeInBits();
if (ShAmt1.getOpcode() == ISD::SUB) {
SDValue Sum = ShAmt1.getOperand(0);
if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
ShAmt1Op1 = ShAmt1Op1.getOperand(0);
if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
return DAG.getNode(Opc, DL, VT,
Op0, Op1,
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
}
} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
return DAG.getNode(Opc, DL, VT,
N0.getOperand(0), N1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL,
MVT::i8, ShAmt0));
} else if (ShAmt1.getOpcode() == ISD::XOR) {
SDValue Mask = ShAmt1.getOperand(1);
if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
ShAmt1Op0 = ShAmt1Op0.getOperand(0);
if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
Op1.getConstantOperandVal(1) == 1) {
return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
Op1.getOperand(0) == Op1.getOperand(1)) {
return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
}
}
}
}
return SDValue();
}
/// Generate NEG and CMOV for integer abs.
static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
// Since X86 does not have CMOV for 8-bit integer, we don't convert
// 8-bit integer abs to NEG and CMOV.
if (VT.isInteger() && VT.getSizeInBits() == 8)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CMOV.
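// With Y = X >>s (size(X)-1), Y is 0 for non-negative X and all-ones for
// negative X, so (X + Y) ^ Y == |X|; the SUB computes 0 - X and the CMOV then
// selects between X and its negation based on the resulting flags.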
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
// Generate SUB & CMOV.
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0.getOperand(0));
SDValue Ops[] = {N0.getOperand(0), Neg,
DAG.getConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
}
return SDValue();
}
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
/// SETGT(X, -1)
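/// The logical (not arithmetic) shift puts the sign bit of X into bit 0, so
/// xor'ing with 1 computes "X is non-negative", i.e. X > -1 as a signed
/// comparison.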
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
// This is only worth doing if the output type is i8 or i1.
EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
return SDValue();
// Make sure we are performing an xor against one.
if (!isOneConstant(N1))
return SDValue();
// SetCC on x86 zero extends so only act on this if it's a logical shift.
SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
return SDValue();
// Make sure we are truncating from one of i16, i32 or i64.
EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
return SDValue();
// Make sure the shift amount extracts the sign bit.
if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
return SDValue();
// Create a greater-than comparison against -1.
// N.B. Using SETGE against 0 works but we want a canonical-looking
// comparison; using SETGT matches up with what TranslateX86CC does.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
EVT ShiftOpTy = ShiftOp.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), ResultType);
SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isSimple())
return SDValue();
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
}
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
if (!ShiftBV)
return SDValue();
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
auto *ShiftAmt = ShiftBV->getConstantSplatNode();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
/// Check if truncation with saturation form type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
return false;
// FIXME: Scalar type may be supported if we move it to vector register.
if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
return false;
EVT SrcElVT = SrcVT.getScalarType();
EVT DstElVT = DstVT.getScalarType();
if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
return false;
if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
return false;
if (SrcVT.is512BitVector() || Subtarget.hasVLX())
return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
return false;
}
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
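/// For example, (truncate (umin X, splat 255)) to an i8 vector first clamps
/// every lane to the i8 range, so the whole expression is an unsigned
/// saturating truncate of X.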
static SDValue detectUSatPattern(SDValue In, EVT VT) {
if (In.getOpcode() != ISD::UMIN)
return SDValue();
// Saturation with truncation. We truncate from InVT to VT.
assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
"Unexpected types for truncate operation");
APInt C;
if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
// C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
// the element size of the destination type.
return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
SDValue();
}
return SDValue();
}
/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
const X86Subtarget &Subtarget) {
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return SDValue();
return detectUSatPattern(In, VT);
}
static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
return SDValue();
if (auto USatVal = detectUSatPattern(In, VT))
if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
return SDValue();
}
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
/// X86ISD::AVG instruction.
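/// The PAVGB/PAVGW instructions compute (a + b + 1) >> 1 using a wider
/// intermediate, so the addition cannot overflow the i8/i16 element type.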
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !VT.isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
isPowerOf2_32(NumElems)))
return SDValue();
// InScalarVT is the intermediate type in the AVG pattern and it should be
// greater than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
return SDValue();
} else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
return SDValue();
} else {
if (VT.getSizeInBits() > 128)
return SDValue();
}
// Detect the following pattern:
//
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
// %6 = trunc <N x i32> %5 to <N x i8>
//
// In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
return SDValue();
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
uint64_t Val = C->getZExtValue();
if (Val < Min || Val > Max)
return false;
}
return true;
};
// Check if each element of the vector is left-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
return SDValue();
// Detect a pattern of a + b + 1 where the order doesn't matter.
SDValue Operands[3];
Operands[0] = LHS.getOperand(0);
Operands[1] = LHS.getOperand(1);
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256] (or [1, 65536] for i16).
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
Operands[0].getOperand(0).getValueType() == VT) {
// The pattern is detected. Subtract one from the constant vector, then
// demote it and emit X86ISD::AVG instruction.
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1]);
}
if (Operands[0].getOpcode() == ISD::ADD)
std::swap(Operands[0], Operands[1]);
else if (Operands[1].getOpcode() != ISD::ADD)
return SDValue();
Operands[2] = Operands[1].getOperand(0);
Operands[1] = Operands[1].getOperand(1);
// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones, and the other two are promoted from i8/i16.
for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
continue;
std::swap(Operands[i], Operands[2]);
// Check if Operands[0] and Operands[1] are results of type promotion.
for (int j = 0; j < 2; ++j)
if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
// The pattern is detected, emit X86ISD::AVG instruction.
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
Operands[1].getOperand(0));
}
return SDValue();
}
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
// into two 16-byte operations. Also split non-temporal aligned loads on
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
AddressSpace, Alignment, &Fast) && !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
SDValue Ptr = Ld->getBasePtr();
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems/2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
SDValue Load2 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1),
Load2.getValue(1));
SDValue NewVec = DAG.getUNDEF(RegVT);
NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
return DCI.CombineTo(N, NewVec, TF, true);
}
return SDValue();
}
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
// This needs to be a build vector of booleans.
// TODO: Checking for the i1 type matches the IR definition for the mask,
// but the mask check could be loosened to i8 or other types. That might
// also require checking more than 'allOnesValue'; e.g., the x86 HW
// instructions only require that the MSB is set for each mask element.
// The ISD::MSTORE comments/definition do not specify how the mask operand
// is formatted.
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
return -1;
int TrueIndex = -1;
unsigned NumElts = BV->getValueType(0).getVectorNumElements();
for (unsigned i = 0; i < NumElts; ++i) {
const SDValue &Op = BV->getOperand(i);
if (Op.isUndef())
continue;
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
if (ConstNode->getAPIntValue().isAllOnesValue()) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
TrueIndex = i;
}
}
return TrueIndex;
}
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
SDValue &Index, unsigned &Alignment) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
return true;
}
/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Load the one scalar element that is specified by the mask using the
// appropriate offset from the base pointer.
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
SDValue Load =
DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
Alignment, ML->getMemOperand()->getFlags());
// Insert the loaded element into the appropriate place in the vector.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
// If we are loading the first and last elements of a vector, it is safe and
// always faster to load the whole vector. Replace the masked load with a
// vector load and select.
unsigned NumElts = VT.getVectorNumElements();
BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMemOperand());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
}
// Convert a masked load with a constant mask into a masked load and a select.
// This allows the select operation to use a faster kind of select instruction
// (for example, vblendvps -> vblendps).
// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.
if (ML->getSrc0().isUndef())
return SDValue();
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMask(), DAG.getUNDEF(VT),
ML->getMemoryVT(), ML->getMemOperand(),
ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
if (Mld->getExtensionType() != ISD::SEXTLOAD)
return SDValue();
// Resolve extending loads.
EVT VT = Mld->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
SDLoc dl(Mld);
assert(LdVT != VT && "Cannot extend to the same type");
unsigned ToSz = VT.getScalarSizeInBits();
unsigned FromSz = LdVT.getScalarSizeInBits();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for extending masked load");
unsigned SizeRatio = ToSz / FromSz;
assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
LdVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Convert Src0 value.
SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
if (!Mld->getSrc0().isUndef()) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), ShuffleVec);
}
// Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
ShuffleVec[i] = NumElems * SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
SelectionDAG &DAG) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
unsigned Alignment;
if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
EVT VT = MS->getValue().getValueType();
EVT EltVT = VT.getVectorElementType();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
MS->getValue(), VecIndex);
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
Alignment, MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
if (Mst->isCompressingStore())
return SDValue();
if (!Mst->isTruncatingStore())
return reduceMaskedStoreToScalarStore(Mst, DAG);
// Resolve truncating stores.
EVT VT = Mst->getValue().getValueType();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
// are designed for truncating stores.
// In those cases we don't need any further transformations.
if (TLI.isTruncStoreLegal(VT, StVT))
return SDValue();
// From/To sizes and ElemCount must be pow of two.
assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
assert (((NumElems * FromSz) % ToSz) == 0 &&
"Unexpected ratio for truncating masked store");
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
"WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
ShuffleVec);
SDValue NewMask;
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type.
NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
DAG.getConstant(0, dl, WideVecVT),
ShuffleVec);
} else {
assert(Mask.getValueType().getVectorElementType() == MVT::i1);
unsigned WidenNumElts = NumElems*SizeRatio;
unsigned MaskNumElts = VT.getVectorNumElements();
EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
Mst->getBasePtr(), NewMask, StVT,
Mst->getMemOperand(), false);
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we are saving a concatenation of two XMM registers and 32-byte stores
// are slow, such as on Sandy Bridge, perform two 16-byte stores.
bool Fast;
unsigned AddressSpace = St->getAddressSpace();
unsigned Alignment = St->getAlignment();
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
AddressSpace, Alignment, &Fast) &&
!Fast) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
Alignment, St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
std::min(16U, Alignment), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
if (St->isTruncatingStore() && VT.isVector()) {
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
if (SDValue Val =
detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
dl, Val, St->getBasePtr(),
St->getMemoryVT(), St->getMemOperand(), DAG);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
unsigned FromSz = VT.getScalarSizeInBits();
unsigned ToSz = StVT.getScalarSizeInBits();
// The truncating store is legal in some cases. For example
// vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
// are designed for truncating stores.
// In those cases we don't need any further transformations.
if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
return SDValue();
// From, To sizes and ElemCount must be pow of two
if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
if (0 != (NumElems * FromSz) % ToSz) return SDValue();
unsigned SizeRatio = FromSz / ToSz;
assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
// Create a type on which we perform the shuffle
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
StVT.getScalarType(), NumElems*SizeRatio);
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
if (!TLI.isTypeLegal(WideVecVT))
return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
ShuffleVec);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
// Find the largest store unit
MVT StoreType = MVT::i8;
for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
(64 <= NumElems * ToSz))
StoreType = MVT::f64;
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i, dl));
SDValue Ch =
DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
Chains.push_back(Ch);
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
if (VT.getSizeInBits() != 64)
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
SDNode* LdVal = St->getValue().getNode();
LoadSDNode *Ld = nullptr;
int TokenFactorIndex = -1;
SmallVector<SDValue, 8> Ops;
SDNode* ChainVal = St->getChain().getNode();
// Must be a store of a load. We currently handle two cases: the load
// is a direct child, and it's under an intervening TokenFactor. It is
// possible to dig deeper under nested TokenFactors.
if (ChainVal == LdVal)
Ld = cast<LoadSDNode>(St->getChain());
else if (St->getValue().hasOneUse() &&
ChainVal->getOpcode() == ISD::TokenFactor) {
for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
if (ChainVal->getOperand(i).getNode() == LdVal) {
TokenFactorIndex = i;
Ld = cast<LoadSDNode>(St->getValue());
} else
Ops.push_back(ChainVal->getOperand(i));
}
}
if (!Ld || !ISD::isNormalLoad(Ld))
return SDValue();
// If this is not the MMX case, i.e. we are just turning i64 load/store
// into f64 load/store, avoid the transformation if there are multiple
// uses of the loaded value.
if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
// If we are a 64-bit capable x86, lower to a single movq load/store pair.
// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
// pair instead.
if (Subtarget.is64Bit() || F64IsLegal) {
MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
- SDValue NewChain = NewLd.getValue(1);
+ // Make sure new load is placed in same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
if (TokenFactorIndex >= 0) {
Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
// Otherwise, lower to two pairs of 32-bit loads / stores.
SDValue LoAddr = Ld->getBasePtr();
SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
Ld->getPointerInfo().getWithOffset(4),
MinAlign(Ld->getAlignment(), 4),
Ld->getMemOperand()->getFlags());
+ // Make sure new loads are placed in same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
+ NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
- SDValue NewChain = LoLd.getValue(1);
if (TokenFactorIndex >= 0) {
- Ops.push_back(LoLd);
- Ops.push_back(HiLd);
+ Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
LoAddr = St->getBasePtr();
HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
SDValue LoSt =
DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
SDValue HiSt = DAG.getStore(
NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
// This is similar to the above case, but here we handle a scalar 64-bit
// integer store that is extracted from a vector on a 32-bit target.
// If we have SSE2, then we can treat it like a floating-point double
// to get past legalization. The execution dependencies fixup pass will
// choose the optimal machine instruction for the store if this really is
// an integer or v2f32 rather than an f64.
if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue OldExtract = St->getOperand(1);
SDValue ExtOp0 = OldExtract.getOperand(0);
unsigned VecSize = ExtOp0.getValueSizeInBits();
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
/// A = < float a0, float a1, float a2, float a3 >
/// and
/// B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// Look for the following pattern: if
// A = < float a0, float a1, float a2, float a3 >
// B = < float b0, float b1, float b2, float b3 >
// and
// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
// At least one of the operands should be a vector shuffle.
if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
// Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
// operate independently on 128-bit lanes.
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts / NumLanes;
assert((NumLaneElts % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
unsigned HalfLaneElts = NumLaneElts/2;
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle then pretend it is the shuffle
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: in what follows a default initialized SDValue represents an UNDEF of
// type VT.
SDValue A, B;
SmallVector<int, 16> LMask(NumElts);
if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (!LHS.getOperand(0).isUndef())
A = LHS.getOperand(0);
if (!LHS.getOperand(1).isUndef())
B = LHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), LMask.begin());
} else {
if (!LHS.isUndef())
A = LHS;
for (unsigned i = 0; i != NumElts; ++i)
LMask[i] = i;
}
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
SmallVector<int, 16> RMask(NumElts);
if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
if (!RHS.getOperand(0).isUndef())
C = RHS.getOperand(0);
if (!RHS.getOperand(1).isUndef())
D = RHS.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
std::copy(Mask.begin(), Mask.end(), RMask.begin());
} else {
if (!RHS.isUndef())
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
RMask[i] = i;
}
// Check that the shuffles are both shuffling the same vectors.
if (!(A == C && B == D) && !(A == D && B == C))
return false;
// If everything is UNDEF then bail out: it would be better to fold to UNDEF.
if (!A.getNode() && !B.getNode())
return false;
// If A and B occur in reverse order in RHS, then "swap" them (which means
// rewriting the mask).
if (A != C)
ShuffleVectorSDNode::commuteMask(RMask);
// At this point LHS and RHS are equivalent to
// LHS = VECTOR_SHUFFLE A, B, LMask
// RHS = VECTOR_SHUFFLE A, B, RMask
// Check that the masks correspond to performing a horizontal operation.
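// Worked example (derived from the doc comment above): for v4f32 we have
// NumElts == 4 and HalfLaneElts == 2, and the expected masks are
// LMask == <0,2,4,6> and RMask == <1,3,5,7>. At i == 2 we get Src == 1 and
// Index == 4, so we require LIdx == 4 and RIdx == 5, i.e. result element 2
// is b0 op b1, exactly as described above.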
for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
for (unsigned i = 0; i != NumLaneElts; ++i) {
int LIdx = LMask[i+l], RIdx = RMask[i+l];
// Ignore any UNDEF components.
if (LIdx < 0 || RIdx < 0 ||
(!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;
// Check that successive elements are being operated on. If not, this is
// not a horizontal operation.
unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
if (!(LIdx == Index && RIdx == Index + 1) &&
!(IsCommutative && LIdx == Index + 1 && RIdx == Index))
return false;
}
}
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
return true;
}
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
bool IsFadd = N->getOpcode() == ISD::FADD;
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
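// e.g. (v4f32 fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
// becomes (X86ISD::FHADD A, B), which can select to haddps.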
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, IsFadd)) {
auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
}
return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDLoc &DL) {
assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
SDValue Src = N->getOperand(0);
unsigned Opcode = Src.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
unsigned TruncSizeInBits = VT.getScalarSizeInBits();
// Repeated operand, so we are only trading one output truncation for
// one input truncation.
if (Op0 == Op1)
return true;
// See if either operand has been extended from a smaller/equal size to
// the truncation size, allowing a truncation to combine with the extend.
unsigned Opcode0 = Op0.getOpcode();
if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
Opcode0 == ISD::ZERO_EXTEND) &&
Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
unsigned Opcode1 = Op1.getOpcode();
if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
Opcode1 == ISD::ZERO_EXTEND) &&
Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
return true;
// See if either operand is a single use constant which can be constant
// folded.
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
};
auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
};
// Don't combine if the operation has other uses.
if (!N->isOnlyUserOf(Src.getNode()))
return SDValue();
// Only support vector truncation for now.
// TODO: i64 scalar math would benefit as well.
if (!VT.isVector())
return SDValue();
// In most cases it's only worth pre-truncating if we're only facing the cost
// of one truncation, i.e. if one of the inputs will constant fold or the
// input is repeated.
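// For example, (trunc (and X, SplatConstant)) becomes
// (and (trunc X), (trunc SplatConstant)): the truncated constant folds away,
// so we still pay for only a single truncation.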
switch (Opcode) {
case ISD::AND:
case ISD::XOR:
case ISD::OR: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
!TLI.isOperationLegal(Opcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
}
return SDValue();
}
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 8> &Regs) {
assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
Regs[0].getValueType() == MVT::v2i64));
EVT OutVT = N->getValueType(0);
EVT OutSVT = OutVT.getVectorElementType();
EVT InVT = Regs[0].getValueType();
EVT InSVT = InVT.getVectorElementType();
SDLoc DL(N);
// First, use a mask to unset all bits that won't appear in the result.
assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
"OutSVT can only be either i8 or i16.");
APInt Mask =
APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
for (auto &Reg : Regs)
Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
MVT UnpackedVT, PackedVT;
if (OutSVT == MVT::i8) {
UnpackedVT = MVT::v8i16;
PackedVT = MVT::v16i8;
} else {
UnpackedVT = MVT::v4i32;
PackedVT = MVT::v8i16;
}
// In each iteration, halve the element size of the value being truncated.
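// e.g. for i32 -> i8 elements the loop runs log2(32/8) == 2 times, packing
// pairs of registers and halving the register count on each pass.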
auto RegNum = Regs.size();
for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
j < e; j *= 2, RegNum /= 2) {
for (unsigned i = 0; i < RegNum; i++)
Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
for (unsigned i = 0; i < RegNum / 2; i++)
Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
Regs[i * 2 + 1]);
}
// If the type of the result is v8i8, we need to do one more X86ISD::PACKUS and
// then extract a subvector as the result, since v8i8 is not a legal type.
if (OutVT == MVT::v8i8) {
Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
DAG.getIntPtrConstant(0, DL));
return Regs[0];
} else if (RegNum > 1) {
Regs.resize(RegNum);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
} else
return Regs[0];
}
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SmallVector<SDValue, 8> &Regs) {
assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
EVT OutVT = N->getValueType(0);
SDLoc DL(N);
// Shift left by 16 bits, then arithmetic-shift right by 16 bits.
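// The shl/sra pair sign-extends bit 15 of each i32 element, so every value
// lies in [-32768, 32767] and the signed saturation in PACKSS below
// reproduces the original low 16 bits exactly.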
SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
for (auto &Reg : Regs) {
Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
Subtarget, DAG);
Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
Subtarget, DAG);
}
for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
Regs[i * 2 + 1]);
if (Regs.size() > 2) {
Regs.resize(Regs.size() / 2);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
} else
return Regs[0];
}
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR whose
/// elements are individually extracted from a vector and then truncated, and
/// it is difficult to perform this optimization on that form.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OutVT = N->getValueType(0);
if (!OutVT.isVector())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
EVT InVT = In.getValueType();
unsigned NumElems = OutVT.getVectorNumElements();
// TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
// SSE2, and we need to take care of it specially.
// AVX512 provides vpmovdb.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
return SDValue();
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
// SSSE3's pshufb results in fewer instructions in the cases below.
if (Subtarget.hasSSSE3() && NumElems == 8 &&
((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
(InSVT == MVT::i32 && OutSVT == MVT::i16)))
return SDValue();
SDLoc DL(N);
// Split a long vector into vectors of legal type.
unsigned RegNum = InVT.getSizeInBits() / 128;
SmallVector<SDValue, 8> SubVec(RegNum);
unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
for (unsigned i = 0; i < RegNum; i++)
SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
DAG.getIntPtrConstant(i * NumSubRegElts, DL));
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
// truncate 2 x v4i32 to v8i16.
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
else if (InSVT == MVT::i32)
return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
else
return SDValue();
}
/// This function transforms vector truncation of 'all-or-none-bits' values,
/// i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Requires SSE2 but AVX512 has fast truncate.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
return SDValue();
SDValue In = N->getOperand(0);
if (!In.getValueType().isSimple())
return SDValue();
MVT VT = N->getValueType(0).getSimpleVT();
MVT SVT = VT.getScalarType();
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
// Use PACKSS if the input is a splatted sign bit.
// e.g. Comparison result, sext_in_reg, etc.
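// For instance, every element of a vector compare result is 0 or -1, so
// ComputeNumSignBits reports a full complement of sign bits and packing with
// signed saturation loses nothing.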
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
if (NumSignBits != InSVT.getSizeInBits())
return SDValue();
// Check we have a truncation suited for PACKSS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
return V;
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
// Try to combine truncation with unsigned saturation.
if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
return Val;
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::x86mmx)
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
// Try to truncate extended sign bits with PACKSS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
return combineVectorTruncation(N, DAG, Subtarget);
}
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
return SDValue();
SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
if (!Op1.getValueType().isFloatingPoint())
return SDValue();
SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
unsigned EltBits = Op1.getScalarValueSizeInBits();
auto isSignMask = [&](const ConstantFP *C) {
return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
};
// There is more than one way to represent the same constant on
// the different X86 targets. The type of the node may also depend on size.
// - load scalar value and broadcast
// - BUILD_VECTOR node
// - load from a constant pool.
// We check all variants here.
if (Op1.getOpcode() == X86ISD::VBROADCAST) {
if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
if (isSignMask(cast<ConstantFP>(C)))
return Op0;
} else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
if (isSignMask(CN->getConstantFPValue()))
return Op0;
} else if (auto *C = getTargetConstantFromNode(Op1)) {
if (C->getType()->isVectorTy()) {
if (auto *SplatV = C->getSplatValue())
if (isSignMask(cast<ConstantFP>(SplatV)))
return Op0;
} else if (auto *FPConst = dyn_cast<ConstantFP>(C))
if (isSignMask(FPConst))
return Op0;
}
return SDValue();
}
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(N);
assert(Arg.getNode() && "N is expected to be an FNEG node");
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
// If we're negating an FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
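// X86ISD::FNMSUB computes -(A*B) - C, so with C == +0.0 this yields -(A*B)
// without materializing a sign-mask constant (valid given the
// no-signed-zeros check below).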
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
return DAG.getBitcast(OrigVT, NewNode);
}
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
unsigned NewOpcode = 0;
if (Arg.hasOneUse()) {
switch (Arg.getOpcode()) {
case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
// We can't handle scalar intrinsic nodes here because they would only
// invert one element and not the whole vector. But we could try to handle
// a negation of the lower element only.
}
}
if (NewOpcode)
return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
Arg.getNode()->ops()));
return SDValue();
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
// If we have integer vector types available, use the integer opcodes.
if (VT.isVector() && Subtarget.hasSSE2()) {
SDLoc dl(N);
MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
}
return SDValue();
}
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
if (Subtarget.hasCMov())
if (SDValue RV = combineIntegerAbs(N, DAG))
return RV;
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
if (isFNEG(N))
return combineFneg(N, DAG, Subtarget);
return SDValue();
}
static bool isNullFPScalarOrVectorConst(SDValue V) {
return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}
/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (e.g., bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (!isNullFPScalarOrVectorConst(V))
return SDValue();
if (V.getValueType().isVector())
return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
return V;
}
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f64 && Subtarget.hasSSE2())))
return SDValue();
auto isAllOnesConstantFP = [](SDValue V) {
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
};
// fand (fxor X, -1), Y --> fandn X, Y
if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
// fand X, (fxor Y, -1) --> fandn Y, X
if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FAND(0.0, x) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
return V;
// FAND(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// FANDN(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// FANDN(x, 0.0) -> 0.0
if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
return V;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
// F[X]OR(0.0, x) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(0)))
return N->getOperand(1);
// F[X]OR(x, 0.0) -> x
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
if (isFNEG(N))
if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
// Only perform optimizations if UnsafeMath is used.
if (!DAG.getTarget().Options.UnsafeFPMath)
return SDValue();
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
// into FMAXC and FMINC, which are commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
}
return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
N->getOperand(0), N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Subtarget.useSoftFloat())
return SDValue();
// TODO: Check for global or instruction-level "nnan". In that case, we
// should be able to lower to FMAX/FMIN alone.
// TODO: If an operand is already known to be a NaN or not a NaN, this
// should be an optional swap and FMAX/FMIN.
EVT VT = N->getValueType(0);
if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
(Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
return SDValue();
// This takes at least 3 instructions, so favor a library call when operating
// on a scalar and minimizing code size.
if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDLoc DL(N);
EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
// Op1
// Num NaN
// ----------------
// Num | Max | Op0 |
// Op0 ----------------
// NaN | Op1 | NaN |
// ----------------
//
// The SSE FP max/min instructions were not designed for this case, but rather
// to implement:
// Min = Op1 < Op0 ? Op1 : Op0
// Max = Op1 > Op0 ? Op1 : Op0
//
// So they always return Op0 if either input is a NaN. However, we can still
// use those instructions for fmaxnum by selecting away a NaN input.
// If either operand is NaN, the 2nd source operand (Op0) is passed through.
auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// ANDNP(0, x) -> x
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return N->getOperand(1);
// ANDNP(x, 0) -> 0
if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
EVT VT = N->getValueType(0);
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// BT ignores high bits in the bit index operand.
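// e.g. for a 32-bit BT only the low 5 bits of the index are demanded (the low
// 6 for 64-bit), which lets SimplifyDemandedBits shrink constants and drop
// redundant masking of the index.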
SDValue Op1 = N->getOperand(1);
if (Op1.hasOneUse()) {
unsigned BitWidth = Op1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
return SDValue();
}
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
SDLoc dl(N);
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
// since there is no sign-extending shift-right operation on a vector with
// 64-bit elements.
// (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2: it may be replaced with an
// X86ISD::VSEXT node.
if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
return SDValue();
}
/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
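/// For example, (i64 sext (add nsw (i32 X), 5)) becomes
/// (add nsw (i64 sext X), 5), and the wide add can then fold into an LEA or a
/// complex addressing mode together with a neighboring add or shl.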
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
Ext->getOpcode() != ISD::ZERO_EXTEND)
return SDValue();
// TODO: This should be valid for other integer types.
EVT VT = Ext->getValueType(0);
if (VT != MVT::i64)
return SDValue();
SDValue Add = Ext->getOperand(0);
if (Add.getOpcode() != ISD::ADD)
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
bool NSW = Add->getFlags().hasNoSignedWrap();
bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
// into the 'zext'.
if ((Sext && !NSW) || (!Sext && !NUW))
return SDValue();
// Having a constant operand to the 'add' ensures that we are not increasing
// the instruction count because the constant is extended for free below.
// A constant operand can also become the displacement field of an LEA.
auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
if (!AddOp1)
return SDValue();
// Don't make the 'add' bigger if there's no hope of combining it with some
// other 'add' or 'shl' instruction.
// TODO: It may be profitable to generate simpler LEA instructions in place
// of single 'add' instructions, but the cost model for selecting an LEA
// currently has a high threshold.
bool HasLEAPotential = false;
for (auto *User : Ext->uses()) {
if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
HasLEAPotential = true;
break;
}
}
if (!HasLEAPotential)
return SDValue();
// Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
SDValue AddOp0 = Add.getOperand(0);
SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended.
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem8_{s/z}ext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
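/// For example, (i32 sext (i8 srem X, Y)) becomes the second result of an
/// SDIVREM8_SEXT_HREG node, whose sign-extended remainder comes straight out
/// of AH.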
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
auto OpcodeN = N->getOpcode();
auto OpcodeN0 = N0.getOpcode();
if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
(OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
return SDValue();
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
return SDValue();
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
: X86ISD::UDIVREM8_ZEXT_HREG;
SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
N0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
return R.getValue(1);
}
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
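/// For example, on SSE4.1 a (v4i32 sext (v4i16 X)) has its input widened to
/// v8i16 by concatenating with undef and is then lowered via
/// SIGN_EXTEND_VECTOR_INREG (typically a single pmovsxwd).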
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget.hasSSE2())
return SDValue();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT SVT = VT.getScalarType();
EVT InVT = N0.getValueType();
EVT InSVT = InVT.getScalarType();
// Input type must be a vector and we must be extending legal integer types.
if (!VT.isVector())
return SDValue();
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
return SDValue();
// On AVX2+ targets, if the input/output types are both legal then we will be
// able to use SIGN_EXTEND/ZERO_EXTEND directly.
if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
DAG.getTargetLoweringInfo().isTypeLegal(InVT))
return SDValue();
SDLoc DL(N);
auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
EVT InVT = N.getValueType();
EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
Size / InVT.getScalarSizeInBits());
SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
DAG.getUNDEF(InVT));
Opnds[0] = N;
return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
};
// If the target size is less than 128 bits, widen the input so that extending
// it gives a full 128-bit vector, extend that, and extract a subvector of the
// original target type.
if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
unsigned Scale = 128 / VT.getSizeInBits();
EVT ExVT =
EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
DAG.getIntPtrConstant(0, DL));
}
// If the target size is 128 bits (or 256 bits on an AVX2 target), then convert
// to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41, to let the legalizer do its job.
if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
(VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.hasAVX512())) {
SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
return Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
}
auto SplitAndExtendInReg = [&](unsigned SplitSize) {
unsigned NumVecs = VT.getSizeInBits() / SplitSize;
unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
SmallVector<SDValue, 8> Opnds;
for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
DAG.getIntPtrConstant(Offset, DL));
SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
SrcVec = Opcode == ISD::SIGN_EXTEND
? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
Opnds.push_back(SrcVec);
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
};
// On pre-AVX2 targets, split into 128-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
return SplitAndExtendInReg(128);
// On pre-AVX512 targets, split into 256-bit nodes of
// ISD::*_EXTEND_VECTOR_INREG.
if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
return SplitAndExtendInReg(256);
return SDValue();
}
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
SDLoc DL(N);
if (SDValue DivRem8 = getDivRem8(N, DAG))
return DivRem8;
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Inverting and sign-extending a boolean is the same as zero-extending and
// subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
// efficiently lowered with an LEA or a DEC.
// This is the same as: select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (Subtarget.hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
return SDValue();
}
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
SDValue A = N->getOperand(0);
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
auto invertIfNegative = [](SDValue &V) {
if (SDValue NegVal = isFNEG(V.getNode())) {
V = NegVal;
return true;
}
return false;
};
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
bool NegB = invertIfNegative(B);
bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
unsigned NewOpcode;
if (!NegMul)
NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
if (N->getOpcode() == X86ISD::FMADD_RND) {
switch (NewOpcode) {
case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
}
} else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
switch (NewOpcode) {
case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
}
} else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
switch (NewOpcode) {
case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
}
} else {
assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
"Unexpected opcode!");
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
// This eliminates the zext. This transformation is necessary because
// ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
if (!isOneConstant(N0.getOperand(1)))
return SDValue();
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (N0.getOpcode() == ISD::TRUNCATE &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, dl, VT));
}
}
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
if (VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
if (SDValue DivRem8 = getDivRem8(N, DAG))
return DivRem8;
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
return R;
return SDValue();
}
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
// We're looking for an oversized integer equality comparison, but ignore a
// comparison with zero because that gets special treatment in EmitTest().
SDValue X = SetCC->getOperand(0);
SDValue Y = SetCC->getOperand(1);
EVT OpVT = X.getValueType();
unsigned OpSize = OpVT.getSizeInBits();
if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
return SDValue();
// TODO: Use PXOR + PTEST for SSE4.1 or later?
// TODO: Add support for AVX-512.
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && Subtarget.hasAVX2())) {
EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
return SDValue();
}
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
EVT OpVT = LHS.getValueType();
// 0-x == y --> x+y == 0
// 0-x != y --> x+y != 0
if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
LHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
// x == 0-y --> x+y == 0
// x != 0-y --> x+y != 0
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
RHS.hasOneUse()) {
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
}
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
(LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
if (!IsSEXT0 || !IsVZero1) {
// Swap the operands and update the condition code.
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
(LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
}
if (IsSEXT0 && IsVZero1) {
assert(VT == LHS.getOperand(0).getValueType() &&
"Uexpected operand type");
if (CC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (CC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
"Unexpected condition code!");
return LHS.getOperand(0);
}
}
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
// to avoid scalarization via legalization because v4i32 is not a legal type.
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
LHS.getValueType() == MVT::v4f32)
return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
return SDValue();
}
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
// Gather and Scatter instructions use k-registers for masks. The type of
// the masks is v*i1. So the mask will be truncated anyway.
// The SIGN_EXTEND_INREG may be dropped.
SDValue Mask = N->getOperand(2);
if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[2] = Mask.getOperand(0);
DAG.UpdateNodeOperands(N, NewOps);
}
return SDValue();
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
}
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
return SDValue();
}
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away the operation when it is applied to a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
return Res;
}
return SDValue();
}
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
EVT InSVT = InVT.getScalarType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
if (DAG.SignBitIsZero(Op0))
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
return SDValue();
}
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
SDValue Op0 = N->getOperand(0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
EVT InSVT = InVT.getScalarType();
// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
if (InVT.isVector() &&
(InSVT == MVT::i8 || InSVT == MVT::i16 ||
(InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
SDLoc dl(N);
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Without AVX512DQ we only support i64 to float scalar conversion. For both
// vectors and scalars, see if we know that the upper bits are all the sign
// bit, in which case we can truncate the input to i32 and convert from that.
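// e.g. an i64 value with at least 33 sign bits is exactly representable as an
// i32, so (sint_to_fp (trunc X to i32)) produces the same result.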
if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
}
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
return SDValue();
if (!Ld->isVolatile() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
}
return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
// When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
SDValue Carry = N->getOperand(0);
while (Carry.getOpcode() == ISD::TRUNCATE ||
Carry.getOpcode() == ISD::ZERO_EXTEND ||
Carry.getOpcode() == ISD::SIGN_EXTEND ||
Carry.getOpcode() == ISD::ANY_EXTEND ||
(Carry.getOpcode() == ISD::AND &&
isOneConstant(Carry.getOperand(1))))
Carry = Carry.getOperand(0);
if (Carry.getOpcode() == X86ISD::SETCC ||
Carry.getOpcode() == X86ISD::SETCC_CARRY) {
if (Carry.getConstantOperandVal(0) == X86::COND_B)
return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
}
}
return SDValue();
}
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
if (X86::isZeroNode(N->getOperand(0)) &&
X86::isZeroNode(N->getOperand(1)) &&
// We don't have a good way to replace an EFLAGS use, so only do this when
// the EFLAGS result is dead right now.
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL,
MVT::i8),
N->getOperand(2)),
DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
return SDValue();
}
/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
/// which is more useful than 0/1 in some cases.
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
SDLoc DL(N);
// "Condition code B" is also known as "the carry flag" (CF).
SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
MVT VT = N->getSimpleValueType(0);
if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
assert(VT == MVT::i1 && "Unexpected type for SETCC node");
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
bool IsSub = N->getOpcode() == ISD::SUB;
SDValue X = N->getOperand(0);
SDValue Y = N->getOperand(1);
// If this is an add, canonicalize a zext operand to the RHS.
// TODO: Incomplete? What if both sides are zexts?
if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
Y.getOpcode() != ISD::ZERO_EXTEND)
std::swap(X, Y);
// Look through a one-use zext.
bool PeekedThroughZext = false;
if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
Y = Y.getOperand(0);
PeekedThroughZext = true;
}
// If this is an add, canonicalize a setcc operand to the RHS.
// TODO: Incomplete? What if both sides are setcc?
// TODO: Should we allow peeking through a zext of the other operand?
if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
Y.getOpcode() != X86ISD::SETCC)
std::swap(X, Y);
if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
SDLoc DL(N);
EVT VT = N->getValueType(0);
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
auto *ConstantX = dyn_cast<ConstantSDNode>(X);
if (ConstantX) {
if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
// This is a complicated way to get -1 or 0 from the carry flag:
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
SDValue EFLAGS = Y->getOperand(1);
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Swap the operands of a SUB, and we have the same pattern as above.
// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
SDValue NewSub = DAG.getNode(
X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
}
if (CC == X86::COND_B) {
// X + SETB Z --> X + (mask SBB Z, Z)
// X - SETB Z --> X - (mask SBB Z, Z)
// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
if (SBB.getValueSizeInBits() != VT.getSizeInBits())
SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
}
if (CC == X86::COND_A) {
SDValue EFLAGS = Y->getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
if (SBB.getValueSizeInBits() != VT.getSizeInBits())
SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
}
}
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
SDValue Z = Cmp.getOperand(0);
EVT ZVT = Z.getValueType();
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
SDValue Zero = DAG.getConstant(0, DL, ZVT);
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
// with fake operands:
// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
DAG.getConstant(0, DL, VT), Cmp1);
}
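// Illustrative note (an assumption about typical input, not taken from the
// patch): a vectorized reduction such as
//   for (...) sum += (int)a[i] * (int)b[i];   // a and b hold 16-bit values
// reaches here as (add (mul x, y), phi) carrying vector-reduction flags. The
// combine below shrinks the multiply back to i16 and emits VPMADDWD, which
// multiplies i16 pairs and accumulates adjacent products into i32 lanes.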
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue MulOp = N->getOperand(0);
SDValue Phi = N->getOperand(1);
if (MulOp.getOpcode() != ISD::MUL)
std::swap(MulOp, Phi);
if (MulOp.getOpcode() != ISD::MUL)
return SDValue();
ShrinkMode Mode;
if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
return SDValue();
EVT VT = N->getValueType(0);
unsigned RegSize = 128;
if (Subtarget.hasBWI())
RegSize = 512;
else if (Subtarget.hasAVX2())
RegSize = 256;
unsigned VectorSize = VT.getVectorNumElements() * 16;
// If the vector size is less than 128, or greater than the supported RegSize,
// do not use PMADD.
if (VectorSize < 128 || VectorSize > RegSize)
return SDValue();
SDLoc DL(N);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements() / 2);
// Shrink the operands of mul.
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
// Madd vector size is half of the original vector size
SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
// Fill the rest of the output with 0
SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
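// Illustrative note (an assumption about typical input): a sum-of-absolute-
// differences reduction such as
//   for (...) sum += abs(a[i] - b[i]);   // a and b hold unsigned 8-bit values
// shows up as a reduction add whose other operand is a vselect-based
// abs(zext - zext) pattern. The combine below rewrites it to use PSADBW,
// which computes the SAD of byte pairs directly in hardware.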
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// TODO: There's nothing special about i32, any integer type above i16 should
// work just as well.
if (!VT.isVector() || !VT.isSimple() ||
!(VT.getVectorElementType() == MVT::i32))
return SDValue();
unsigned RegSize = 128;
if (Subtarget.hasBWI())
RegSize = 512;
else if (Subtarget.hasAVX2())
RegSize = 256;
// We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
// We know N is a reduction add, which means one of its operands is a phi.
// To match SAD, we need the other operand to be a vector select.
SDValue SelectOp, Phi;
if (Op0.getOpcode() == ISD::VSELECT) {
SelectOp = Op0;
Phi = Op1;
} else if (Op1.getOpcode() == ISD::VSELECT) {
SelectOp = Op1;
Phi = Op0;
} else
return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
if (!detectZextAbsDiff(SelectOp, Op0, Op1))
return SDValue();
// SAD pattern detected. Now build a SAD instruction and an addition for
// reduction. Note that the result of the SAD has fewer elements than its
// input, so we can only update part of the elements in the reduction
// vector.
SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
// The output of PSADBW is a vector of i64.
// We need to turn the vector of i64 into a vector of i32.
// If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// anyway.
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
else
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
// Update part of the elements of the reduction vector. This is done by
// first extracting a sub-vector from it, updating this sub-vector, and
// inserting it back.
SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
DAG.getIntPtrConstant(0, DL));
SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
DAG.getIntPtrConstant(0, DL));
} else
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
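///
/// A minimal sketch of the intended effect (illustrative, assuming SSE2 and a
/// v4i32 operand): for
///   add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
/// the rewritten form can lower to
///   pcmpeqd %xmm1, %xmm1    # all-ones idiom, no load, no input dependency
///   psubd   %xmm1, %xmm0    # x - (-1) == x + 1
/// rather than loading a splat-of-1 constant from memory.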
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Unexpected opcode for increment/decrement transform");
// Pseudo-legality check: getOnesVector() expects one of these types, so bail
// out and wait for legalization if we have an unsupported vector length.
EVT VT = N->getValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
return SDValue();
SDNode *N1 = N->getOperand(1).getNode();
APInt SplatVal;
if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
return SDValue();
SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// Try to synthesize horizontal adds from adds of shuffles.
if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
if (SDValue V = combineIncDecVector(N, DAG))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is an XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, SDLoc(Op1), VT));
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
}
}
// Try to synthesize horizontal subs from subs of shuffles.
EVT VT = N->getValueType(0);
if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
(Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
if (SDValue V = combineIncDecVector(N, DAG))
return V;
return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize())
return SDValue();
SDLoc DL(N);
unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = SVT.getSizeInBits();
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
unsigned InputBits = OpEltSizeInBits * NumElts;
// Perform any constant folding.
// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
APInt UndefElts;
SmallVector<APInt, 64> EltBits;
if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
APInt Undefs(NumElts, 0);
SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
bool IsZEXT =
(Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
for (unsigned i = 0; i != NumElts; ++i) {
if (UndefElts[i]) {
Undefs.setBit(i);
continue;
}
Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
: EltBits[i].sextOrTrunc(EltSizeInBits);
}
return getConstVector(Vals, Undefs, VT, DAG, DL);
}
// (vzext (bitcast (vzext x))) -> (vzext x)
// TODO: (vsext (bitcast (vsext x))) -> (vsext x)
SDValue V = peekThroughBitcasts(Op);
if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
MVT InnerVT = V.getSimpleValueType();
MVT InnerEltVT = InnerVT.getVectorElementType();
// If the element sizes match exactly, we can just do one larger vzext. This
// is always an exact type match as vzext operates on integer types.
if (OpEltVT == InnerEltVT) {
assert(OpVT == InnerVT && "Types must match for vzext!");
return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
}
// The only other way we can combine them is if only a single element of the
// inner vzext is used in the input to the outer vzext.
if (InnerEltVT.getSizeInBits() < InputBits)
return SDValue();
// In this case, the inner vzext is completely dead because we're going to
// only look at bits inside of the low element. Just do the outer vzext on
// a bitcast of the input to the inner.
return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
}
// Check if we can bypass extracting and re-inserting an element of an input
// vector. Essentially:
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
// TODO: Add X86ISD::VSEXT support
if (Opcode == X86ISD::VZEXT &&
V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
SDValue ExtractedV = V.getOperand(0);
SDValue OrigV = ExtractedV.getOperand(0);
if (isNullConstant(ExtractedV.getOperand(1))) {
MVT OrigVT = OrigV.getSimpleValueType();
// Extract a subvector if necessary...
if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
OrigVT.getVectorNumElements() / Ratio);
OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
DAG.getIntPtrConstant(0, DL));
}
Op = DAG.getBitcast(OpVT, OrigV);
return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
}
}
return SDValue();
}
/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Chain = N->getOperand(0);
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
MVT VT = RHS.getSimpleValueType();
SDLoc DL(N);
auto *C = dyn_cast<ConstantSDNode>(RHS);
if (!C || C->getZExtValue() != 1)
return SDValue();
RHS = DAG.getConstant(-1, DL, VT);
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
DAG.getVTList(MVT::i32, MVT::Other),
{Chain, LHS, RHS}, VT, MMO);
}
// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
return SDValue();
EVT VT = N->getValueType(0);
SDLoc DL(N);
return DAG.getNode(X86ISD::TESTM, DL, VT,
Op0->getOperand(0), Op0->getOperand(1));
}
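// Illustrative note (assumption): comparing a value against itself folds to a
// constant, e.g. pcmpeqd %xmm0, %xmm0 is always all-ones and
// pcmpgtd %xmm0, %xmm0 is always zero, so the compare below can be replaced
// by the corresponding constant vector.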
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
return getOnesVector(VT, DAG, DL);
if (N->getOpcode() == X86ISD::PCMPGT)
return getZeroVector(VT, Subtarget, DAG, DL);
}
return SDValue();
}
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
SDValue Idx = N->getOperand(2);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT OpVT = N->getSimpleValueType(0);
MVT SubVecVT = SubVec.getSimpleValueType();
// If this is an insert of an extract, combine to a shuffle. Don't do this
// if the insert or extract can be represented with a subvector operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 || !Vec.isUndef())) {
int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
Mask[i] = i;
// Now insert the extracted portion.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
}
}
// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
// load:
// (insert_subvector (insert_subvector undef, (load16 addr), 0),
// (load16 addr + 16), Elts/2)
// --> load32 addr
// or:
// (insert_subvector (insert_subvector undef, (load32 addr), 0),
// (load32 addr + 32), Elts/2)
// --> load64 addr
// or a 16-byte or 32-byte broadcast:
// (insert_subvector (insert_subvector undef, (load16 addr), 0),
// (load16 addr), Elts/2)
// --> X86SubVBroadcast(load16 addr)
// or:
// (insert_subvector (insert_subvector undef, (load32 addr), 0),
// (load32 addr), Elts/2)
// --> X86SubVBroadcast(load32 addr)
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
if (Idx2 && Idx2->getZExtValue() == 0) {
SDValue SubVec2 = Vec.getOperand(1);
// If needed, look through bitcasts to get to the load.
if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
bool Fast;
unsigned Alignment = FirstLd->getAlignment();
unsigned AS = FirstLd->getAddressSpace();
const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {
SDValue Ops[] = {SubVec2, SubVec};
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
Subtarget, false))
return Ld;
}
}
// If lower/upper loads are the same and the only users of the load, then
// lower to a VBROADCASTF128/VBROADCASTI128/etc.
if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
}
}
// If this is subv_broadcast insert into both halves, use a larger
// subv_broadcast.
if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
SubVec.getOperand(0));
}
}
}
return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_VECTOR_INREG:
case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VSEXT:
case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
case X86ISD::MOVLHPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VPERMIV3:
case X86ISD::VPERMIL2:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
case X86ISD::FMADD:
case X86ISD::FMADD_RND:
case X86ISD::FMADDS1_RND:
case X86ISD::FMADDS3_RND:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
case ISD::MGATHER:
case ISD::MSCATTER: return combineGatherScatter(N, DAG);
case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
case X86ISD::TESTM: return combineTestM(N, DAG);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
}
return SDValue();
}
/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g., on x86,
/// i16 is legal but undesirable, since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
if (!isTypeLegal(VT))
return false;
if (VT != MVT::i16)
return true;
switch (Opc) {
default:
return true;
case ISD::LOAD:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::SHL:
case ISD::SRL:
case ISD::SUB:
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return false;
}
}
/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
return any_of(MRI.reg_instructions(X86::EFLAGS),
[](const MachineInstr &RI) { return RI.isCopy(); });
}
void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
if (hasCopyImplyingStackAdjustment(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setHasCopyImplyingStackAdjustment(true);
}
TargetLoweringBase::finalizeLowering(MF);
}
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
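///
/// Illustrative sketch (an assumption about the common case): a 16-bit
/// addition such as
///   %r = add i16 %a, %b
/// is typically promoted here to i32 (PVT = MVT::i32), since the 32-bit form
/// avoids the 66h operand-size prefix; the result is truncated back to i16
/// only where it is actually needed.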
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
EVT VT = Op.getValueType();
if (VT != MVT::i16)
return false;
bool Promote = false;
bool Commute = false;
switch (Op.getOpcode()) {
default: break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
Promote = true;
break;
case ISD::SHL:
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
return false;
Promote = true;
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
Commute = true;
LLVM_FALLTHROUGH;
case ISD::SUB: {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
if (!Commute && MayFoldLoad(N1))
return false;
// Avoid disabling potential load folding opportunities.
if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
return false;
if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
return false;
Promote = true;
}
}
PVT = MVT::i32;
return Promote;
}
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Helper to match an asm string against a list of pieces separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
for (StringRef Piece : Pieces) {
if (!S.startswith(Piece)) // Check if the piece matches.
return false;
S = S.substr(Piece.size());
StringRef::size_type Pos = S.find_first_not_of(" \t");
if (Pos == 0) // We matched a prefix.
return false;
S = S.substr(Pos);
}
return S.empty();
}
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
if (AsmPieces.size() == 3)
return true;
else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
return true;
}
}
return false;
}
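// Illustrative note (assumption, GCC-style inline asm): a snippet such as
//   asm("bswap $0" : "=r"(x) : "0"(x));
// on a 32-bit integer is recognized by ExpandInlineAsm below and rewritten
// into a call to llvm.bswap.i32, so later optimizations can see through it.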
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
const std::string &AsmStr = IA->getAsmString();
IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
// TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
switch (AsmPieces.size()) {
default: return false;
case 1:
// FIXME: this should verify that we are targeting a 486 or better. If not,
// we will turn this bswap into something that will be lowered to logical
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
}
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
StringRef ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
if (CI->getType()->isIntegerTy(64)) {
InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
if (Constraints.size() >= 2 &&
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
break;
}
return false;
}
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'R':
case 'q':
case 'Q':
case 'f':
case 't':
case 'u':
case 'y':
case 'x':
case 'v':
case 'Y':
case 'l':
return C_RegisterClass;
case 'k': // AVX512 masking registers.
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
return C_Register;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'G':
case 'C':
case 'e':
case 'Z':
return C_Other;
default:
break;
}
}
else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
case 'Y':
switch (Constraint[1]) {
default:
break;
case 'k':
return C_Register;
}
}
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
case 'a':
case 'b':
case 'c':
case 'd':
case 'S':
case 'D':
case 'A':
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_SpecificReg;
break;
case 'f':
case 't':
case 'u':
if (type->isFloatingPointTy())
weight = CW_SpecificReg;
break;
case 'y':
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
case 'Y':
// Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
if (constraint[1] == 'k') {
// Support for 'Yk' (similarly to the 'k' variant below).
weight = CW_SpecificReg;
break;
}
// Else fall through (handle "Y" constraint).
LLVM_FALLTHROUGH;
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
LLVM_FALLTHROUGH;
case 'x':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
weight = CW_Register;
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
weight = CW_SpecificReg;
break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
if (C->getZExtValue() <= 31)
weight = CW_Constant;
}
break;
case 'J':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 63)
weight = CW_Constant;
}
break;
case 'K':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
weight = CW_Constant;
}
break;
case 'L':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
weight = CW_Constant;
}
break;
case 'M':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 3)
weight = CW_Constant;
}
break;
case 'N':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xff)
weight = CW_Constant;
}
break;
case 'G':
case 'C':
if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
case 'e':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if ((C->getSExtValue() >= -0x80000000LL) &&
(C->getSExtValue() <= 0x7fffffffLL))
weight = CW_Constant;
}
break;
case 'Z':
if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
if (C->getZExtValue() <= 0xffffffff)
weight = CW_Constant;
}
break;
}
return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
if (Subtarget.hasSSE2())
return "Y";
if (Subtarget.hasSSE1())
return "x";
}
return TargetLowering::LowerXConstraint(ConstraintVT);
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue>&Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1) return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'L':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
}
return;
}
case 'Z': {
// 32-bit unsigned value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
Op.getValueType());
break;
}
}
// FIXME gcc accepts some relocatable values here too, but only in certain
// memory models; it's complicated.
return;
}
case 'i': {
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
// Widen to 64 bits here to get it sign extended.
Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// In any sort of PIC mode addresses need to be computed at runtime by
// adding in a register or some sort of table lookup. These can't
// be used as immediates.
if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
return;
// If we are in non-pic codegen mode, we allow the address of a global (with
// an optional displacement) to be used with 'i'.
GlobalAddressSDNode *GA = nullptr;
int64_t Offset = 0;
// Match either (GA), (GA+C), (GA+C1+C2), etc.
while (1) {
if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
Offset += GA->getOffset();
break;
} else if (Op.getOpcode() == ISD::ADD) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Offset += C->getZExtValue();
Op = Op.getOperand(0);
continue;
}
} else if (Op.getOpcode() == ISD::SUB) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Offset += -C->getZExtValue();
Op = Op.getOperand(0);
continue;
}
}
// Otherwise, this isn't something we can handle, reject it.
return;
}
const GlobalValue *GV = GA->getGlobal();
// If we require an extra load to get this address, as in PIC mode, we
// can't accept it.
if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
return;
Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
GA->getValueType(0), Offset);
break;
}
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::GR8RegClass) ||
RC.hasSuperClassEq(&X86::GR16RegClass) ||
RC.hasSuperClassEq(&X86::GR32RegClass) ||
RC.hasSuperClassEq(&X86::GR64RegClass) ||
RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
RC.hasSuperClassEq(&X86::FR64XRegClass) ||
RC.hasSuperClassEq(&X86::VR128XRegClass) ||
RC.hasSuperClassEq(&X86::VR256XRegClass) ||
RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
default: break;
// TODO: Slight differences here in allocation order and leaving
// RIP in the class. Do they matter any more here than they do
// in the normal allocation?
case 'k':
if (Subtarget.hasAVX512()) {
// Only supported in AVX512 or later.
switch (VT.SimpleTy) {
default: break;
case MVT::i32:
return std::make_pair(0U, &X86::VK32RegClass);
case MVT::i16:
return std::make_pair(0U, &X86::VK16RegClass);
case MVT::i8:
return std::make_pair(0U, &X86::VK8RegClass);
case MVT::i1:
return std::make_pair(0U, &X86::VK1RegClass);
case MVT::i64:
return std::make_pair(0U, &X86::VK64RegClass);
}
}
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i64 || VT == MVT::f64)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
return std::make_pair(0U, &X86::GR32_ABCDRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i64)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget.hasSSE2()) break;
LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
default: break;
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR32XRegClass);
return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR128XRegClass);
return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::VR256XRegClass);
return std::make_pair(0U, &X86::VR256RegClass);
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
return std::make_pair(0U, &X86::VR512RegClass);
}
break;
}
} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
switch (Constraint[1]) {
default:
break;
case 'k':
// This register class doesn't allocate k0 for masked vector operations.
if (Subtarget.hasAVX512()) { // Only supported in AVX512.
switch (VT.SimpleTy) {
default: break;
case MVT::i32:
return std::make_pair(0U, &X86::VK32WMRegClass);
case MVT::i16:
return std::make_pair(0U, &X86::VK16WMRegClass);
case MVT::i8:
return std::make_pair(0U, &X86::VK8WMRegClass);
case MVT::i1:
return std::make_pair(0U, &X86::VK1WMRegClass);
case MVT::i64:
return std::make_pair(0U, &X86::VK64WMRegClass);
}
}
break;
}
}
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Map "{st(0)}" .. "{st(7)}" to the corresponding FP stack register.
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' &&
tolower(Constraint[2]) == 't' &&
Constraint[3] == '(' &&
(Constraint[4] >= '0' && Constraint[4] <= '7') &&
Constraint[5] == ')' &&
Constraint[6] == '}') {
Res.first = X86::FP0+Constraint[4]-'0';
Res.second = &X86::RFP80RegClass;
return Res;
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint)) {
Res.first = X86::FP0;
Res.second = &X86::RFP80RegClass;
return Res;
}
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint)) {
Res.first = X86::EFLAGS;
Res.second = &X86::CCRRegClass;
return Res;
}
// 'A' means [ER]AX + [ER]DX.
if (Constraint == "A") {
if (Subtarget.is64Bit()) {
Res.first = X86::RAX;
Res.second = &X86::GR64_ADRegClass;
} else {
assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
"Expecting 64, 32 or 16 bit subtarget");
Res.first = X86::EAX;
Res.second = &X86::GR32_ADRegClass;
}
return Res;
}
return Res;
}
// Otherwise, check to see if this is a register class of the wrong value
// type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
// return "eax". This should even work for things like getting 64-bit integer
// registers when given an f64 type.
const TargetRegisterClass *Class = Res.second;
// The generic code will match the first register class that contains the
// given register. Thus, based on the ordering of the tablegened file,
// the "plain" GR classes might not come first.
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
Res.first = DestReg;
Res.second = Size == 8 ? &X86::GR8RegClass
: Size == 16 ? &X86::GR16RegClass
: Size == 32 ? &X86::GR32RegClass
: &X86::GR64RegClass;
assert(Res.second->contains(Res.first) && "Register in register class");
} else {
// No register found/type mismatch.
Res.first = 0;
Res.second = nullptr;
}
} else if (isFRClass(*Class)) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32RegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64RegClass;
else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
Res.second = &X86::VR128RegClass;
else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
Res.second = &X86::VR256RegClass;
else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: return an error.
Res.first = 0;
Res.second = nullptr;
}
}
return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
// for plain addressing mode, i.e. inst (reg1).
// E.g.,
// vaddps (%rsi,%rdx), %ymm0, %ymm1
// Requires two allocations (one for the load, one for the computation)
// whereas:
// vaddps (%rsi), %ymm0, %ymm1
// Requires just 1 allocation, i.e., freeing allocations for other operations
// and having fewer micro-operations to execute.
//
// For some X86 architectures, this is even worse because for instance for
// stores, the complex addressing mode forces the instruction to use the
// "load" ports instead of the dedicated "store" port.
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
return -1;
}
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
// The exception to this is vector division. Since x86 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
if (!Subtarget.is64Bit())
return;
// Update IsSplitCSR in X86MachineFunctionInfo.
X86MachineFunctionInfo *AFI =
Entry->getParent()->getInfo<X86MachineFunctionInfo>();
AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (X86::GR64RegClass.contains(*I))
RC = &X86::GR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(Entry->getParent()->getFunction()->hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
if (MF.getFunction()->hasFnAttribute("probe-stack"))
return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
return "";
// We need a stack probe to conform to the Windows ABI. Choose the right
// symbol.
if (Subtarget.is64Bit())
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 705d0f7a5cf7..0e654a380e7c 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1,10213 +1,10244 @@
//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 AVX512 instruction set, defining the
// instructions, and properties of the instructions which are needed for code
// generation, machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//
// Group template arguments that can be derived from the vector type (EltNum x
// EltVT). These are things like the register class for the writemask, etc.
// The idea is to pass one of these as the template argument rather than the
// individual arguments.
// The template is also used for scalar types; in that case NumElts is 1.
class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
string suffix = ""> {
RegisterClass RC = rc;
ValueType EltVT = eltvt;
int NumElts = numelts;
// Corresponding mask register class.
RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
// Corresponding write-mask register class.
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
// The mask VT.
ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
// Suffix used in the instruction mnemonic.
string Suffix = suffix;
// VTName is a string name for the vector VT. For vector types it will be
// v # NumElts # EltVT, so for a vector of 8 elements of i32 it will be v8i32.
// It is a little more complex for scalar types, where NumElts = 1:
// in that case we build v4f32 or v2f64.
string VTName = "v" # !if (!eq (NumElts, 1),
!if (!eq (EltVT.Size, 32), 4,
!if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT;
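// For example (illustrative): v16i32_info yields VTName = "v16i32", while the
// scalar f64x_info (NumElts = 1) yields VTName = "v2f64".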
// The vector VT.
ValueType VT = !cast<ValueType>(VTName);
string EltTypeName = !cast<string>(EltVT);
// Size of the element type in bits, e.g. 32 for v16i32.
string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
int EltSize = EltVT.Size;
// "i" for integer types and "f" for floating-point types
string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
// Size of RC in bits, e.g. 512 for VR512.
int Size = VT.Size;
// The corresponding memory operand, e.g. i512mem for VR512.
X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
// FP scalar memory operand for intrinsics - ssmem/sdmem.
Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
!if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
// Load patterns
// Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
// due to load promotion during legalization
PatFrag LdFrag = !cast<PatFrag>("load" #
!if (!eq (TypeVariantName, "i"),
!if (!eq (Size, 128), "v2i64",
!if (!eq (Size, 256), "v4i64",
!if (!eq (Size, 512), "v8i64",
VTName))), VTName));
PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
!if (!eq (TypeVariantName, "i"),
!if (!eq (Size, 128), "v2i64",
!if (!eq (Size, 256), "v4i64",
!if (!eq (Size, 512), "v8i64",
VTName))), VTName));
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
!cast<ComplexPattern>("sse_load_f32"),
!if (!eq (EltTypeName, "f64"),
!cast<ComplexPattern>("sse_load_f64"),
?));
// The corresponding float type, e.g. v16f32 for v16i32
// Note: For EltSize < 32, FloatVT is illegal and TableGen
// fails to compile, so we choose FloatVT = VT
ValueType FloatVT = !cast<ValueType>(
!if (!eq (!srl(EltSize,5),0),
VTName,
!if (!eq(TypeVariantName, "i"),
"v" # NumElts # "f" # EltSize,
VTName)));
ValueType IntVT = !cast<ValueType>(
!if (!eq (!srl(EltSize,5),0),
VTName,
!if (!eq(TypeVariantName, "f"),
"v" # NumElts # "i" # EltSize,
VTName)));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
// 8-bit compressed displacement tuple/subvector format. This is only
// defined for NumElts <= 8.
CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
!cast<CD8VForm>("CD8VT" # NumElts), ?);
SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
!if (!eq (Size, 256), sub_ymm, ?));
Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
!if (!eq (EltTypeName, "f64"), SSEPackedDouble,
SSEPackedInt));
RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
// A vector type of the same width with element type i64. This is used to
// create patterns for logic ops.
ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
// A vector type of the same width with element type i32. This is used to
// create the canonical constant zero node ImmAllZerosV.
ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
string ZSuffix = !if (!eq (Size, 128), "Z128",
!if (!eq (Size, 256), "Z256", "Z"));
}
def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">;
def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">;
def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">;
// "x" in v32i8x_info means RC = VR256X
def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">;
def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">;
def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">;
def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">;
def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">;
def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;
// We map scalar types to the smallest (128-bit) vector type
// with the appropriate element type. This allows us to use the same masking logic.
def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">;
def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">;
def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;
class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
X86VectorVTInfo i128> {
X86VectorVTInfo info512 = i512;
X86VectorVTInfo info256 = i256;
X86VectorVTInfo info128 = i128;
}
def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
v16i8x_info>;
def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
v8i16x_info>;
def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
v4i32x_info>;
def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
v2i64x_info>;
def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
v4f32x_info>;
def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
v2f64x_info>;
class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
ValueType _vt> {
RegisterClass KRC = _krc;
RegisterClass KRCWM = _krcwm;
ValueType KVT = _vt;
}
def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
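// KRC is the plain mask register class; KRCWM is the corresponding write-mask
// class used for the {k} operand (k0 is not a valid write mask, since encoding
// k0 there means "no masking").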
// This multiclass generates the masking variants from the non-masking
// variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as
// template arguments.
multiclass AVX512_maskable_custom<bits<8> O, Format F,
dag Outs,
dag Ins, dag MaskingIns, dag ZeroMaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0,
bit IsKCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
let isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern, itin>,
EVEX_K {
// In the case of the 3src subclass this is overridden with a let.
string Constraints = MaskingConstraint;
}
// Zero masking does not add any restrictions to the commute-operands
// transformation, so it is OK to use IsCommutable instead of IsKCommutable.
let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
ZeroMaskingPattern,
itin>,
EVEX_KZ;
}
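// For example, with an OpcodeStr of "vfoo" (hypothetical) the three records
// NAME, NAME#k and NAME#kz carry AT&T assembly of the form
//   vfoo <AttSrcAsm>, $dst
//   vfoo <AttSrcAsm>, $dst {${mask}}
//   vfoo <AttSrcAsm>, $dst {${mask}} {z}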
// Common base class of AVX512_maskable and AVX512_maskable_3src.
multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs,
dag Ins, dag MaskingIns, dag ZeroMaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
SDNode Select = vselect,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0,
bit IsKCommutable = 0> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
MaskingConstraint, NoItinerary, IsCommutable,
IsKCommutable>;
// Similar to AVX512_maskable_common, but with scalar types.
multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs,
dag Ins, dag MaskingIns, dag ZeroMaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
SDNode Select = vselect,
string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0,
bit IsKCommutable = 0> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[], [], [],
MaskingConstraint, NoItinerary, IsCommutable,
IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
// preserved vector elements come from a new dummy input operand tied to $dst.
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0, bit IsKCommutable = 0,
SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
"$src0 = $dst", itin, IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
X86selects, "$src0 = $dst", itin, IsCommutable>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
// $src1.
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, bit IsCommutable = 0,
bit IsKCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(vselect _.KRCWM:$mask, RHS, _.RC:$src1),
vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, bit IsCommutable = 0,
bit IsKCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
X86selects, "", NoItinerary, IsCommutable,
IsKCommutable>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
"$src0 = $dst">;
// Instructions with a mask that put their result in a mask register,
// like "compare" and "vptest".
multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
dag Outs,
dag Ins, dag MaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
list<dag> MaskingPattern,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
Pattern, NoItinerary>;
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
MaskingPattern, NoItinerary>, EVEX_K;
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs,
dag Ins, dag MaskingIns,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
bit IsCommutable = 0> :
AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.KRC:$dst, RHS)],
[(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(and _.KRCWM:$mask, RHS), IsCommutable>;
multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm> :
AVX512_maskable_custom_cmp<O, F, Outs,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
AttSrcAsm, IntelSrcAsm, [],[]>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
// preserved vector elements come from a new dummy input operand tied to $dst.
multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskedRHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0, SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskedRHS,
_.ImmAllZerosV))],
"$src0 = $dst", itin, IsCommutable>;
// Bitcasts between 512-bit vector types. Return the original type since
// no instruction is needed for the conversion.
def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllZerosV))]>;
def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
// Alias instructions that allow VPTERNLOG to be used with a mask to create
// a mix of all ones and all zeros elements. This is done this way to force
// the same register to be used as input for all three sources.
let isPseudo = 1, Predicates = [HasAVX512] in {
def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
(ins VK16WM:$mask), "",
[(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
(v16i32 immAllOnesV),
(v16i32 immAllZerosV)))]>;
def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
(ins VK8WM:$mask), "",
[(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
(bc_v8i64 (v16i32 immAllOnesV)),
(bc_v8i64 (v16i32 immAllZerosV))))]>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
[(set VR128X:$dst, (v4i32 immAllZerosV))]>;
def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
[(set VR256X:$dst, (v8i32 immAllZerosV))]>;
}
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
[(set FR64X:$dst, fpimm0)]>;
}
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
PatFrag vinsert_insert> {
let ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V,
EVEX_CD8<From.EltSize, From.CD8TupleForm>;
}
}
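// For example, when instantiated below as VINSERTF32x4Z this roughly yields
// VINSERTF32x4Zrr/rm plus their k/kz masked forms, matching assembly such as
//   vinsertf32x4 $1, %xmm1, %zmm2, %zmm0 {%k1}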
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vinsert_insert,
SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
let Predicates = p in {
def : Pat<(vinsert_insert:$ins
(To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)),
(To.VT (!cast<Instruction>(InstrStr#"rr")
To.RC:$src1, From.RC:$src2,
(INSERT_get_vinsert_imm To.RC:$ins)))>;
def : Pat<(vinsert_insert:$ins
(To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
(iPTR imm)),
(To.VT (!cast<Instruction>(InstrStr#"rm")
To.RC:$src1, addr:$src2,
(INSERT_get_vinsert_imm To.RC:$ins)))>;
}
}
multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256> {
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
vinsert128_insert>, EVEX_V256;
defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo<16, EltVT32, VR512>,
vinsert128_insert>, EVEX_V512;
defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
vinsert256_insert>, VEX_W, EVEX_V512;
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
vinsert128_insert>, VEX_W, EVEX_V256;
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
vinsert128_insert>, VEX_W, EVEX_V512;
defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo<16, EltVT32, VR512>,
vinsert256_insert>, EVEX_V512;
}
}
defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
// Codegen patterns with the alternative types. Only add these if 64x2 and
// its friends are not supported natively via AVX512DQ.
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
// Codegen patterns with the alternative types: insert VEC128 into VEC256
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
// Codegen patterns with the alternative types: insert VEC128 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
// Codegen patterns with the alternative types: insert VEC256 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
// vinsertps - insert f32 to XMM
let ExeDomain = SSEPackedSingle in {
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
EVEX_4V;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
}
//===----------------------------------------------------------------------===//
// AVX-512 VECTOR EXTRACT
//---
multiclass vextract_for_size<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
PatFrag vextract_extract,
SDNodeXForm EXTRACT_get_vextract_imm> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
// Use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
// vextract_extract); we are interested only in patterns without a mask,
// the intrinsic patterns are matched below.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
(ins From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
[(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
(iPTR imm)))]>,
AVX512AIi8Base, EVEX;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
(From.VT From.RC:$src1), (iPTR imm))),
addr:$dst)]>, EVEX;
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, To.KRCWM:$mask,
From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
"$dst {${mask}}, $src1, $idx}",
[]>, EVEX_K, EVEX;
}
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(vextract_extract:$ext (From.VT From.RC:$src1),
(iPTR imm)),
To.RC:$src0)),
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
From.ZSuffix # "rrk")
To.RC:$src0, To.KRCWM:$mask, From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(vextract_extract:$ext (From.VT From.RC:$src1),
(iPTR imm)),
To.ImmAllZerosV)),
(!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
From.ZSuffix # "rrkz")
To.KRCWM:$mask, From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext))>;
}
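// The two patterns above fold (vselect mask, (extract ...), src0/zero) into
// the masked "rrk" and zero-masked "rrkz" register forms, e.g. assembly like
//   vextractf32x4 $1, %zmm1, %xmm0 {%k1}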
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vextract_extract,
SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> {
let Predicates = p in {
def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)),
(To.VT (!cast<Instruction>(InstrStr#"rr")
From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1),
(iPTR imm))), addr:$dst),
(!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext))>;
}
}
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
ValueType EltVT64, int Opcode256> {
defm NAME # "32x4Z" : vextract_for_size<Opcode128,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
vextract128_extract,
EXTRACT_get_vextract128_imm>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm NAME # "64x4Z" : vextract_for_size<Opcode256,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
vextract256_extract,
EXTRACT_get_vextract256_imm>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
vextract128_extract,
EXTRACT_get_vextract128_imm>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
vextract128_extract,
EXTRACT_get_vextract128_imm>,
VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
vextract128_extract,
EXTRACT_get_vextract128_imm>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm NAME # "32x8Z" : vextract_for_size<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
vextract256_extract,
EXTRACT_get_vextract256_imm>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
// extract_subvector codegen patterns with the alternative types.
// Only add these if 64x2 and its friends are not supported natively via AVX512DQ.
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
// Codegen patterns with the alternative types: extract VEC128 from VEC256
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
// Codegen patterns with the alternative types: extract VEC128 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
// Codegen patterns with the alternative types: extract VEC256 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
// A 128-bit subvector extract from the first 512-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
(v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
(v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
(v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
(v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
(v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>;
// A 256-bit subvector extract from the first 512-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
(v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
(v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
(v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
(v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
(v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>;
def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
(v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>;
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
// A 128-bit subvector insert to the first 512-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))),
(INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
// A 256-bit subvector insert to the first 512-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))),
(INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
}
// vextractps - extract 32 bits from XMM
def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
EVEX;
def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
//---
// broadcast with a scalar argument.
multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#r)
(COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#rk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
(COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast SrcInfo.FRC:$src),
DestInfo.ImmAllZerosV)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
}
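// For example, at 512 bits the first pattern above maps
// (v16f32 (X86VBroadcast FR32X:$src)) onto VBROADCASTSSZr after copying the
// scalar into VR128X; the other two handle the merge- and zero-masked
// vselect forms.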
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
let ExeDomain = DestInfo.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
T8PD, EVEX;
defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(DestInfo.VT (X86VBroadcast
(SrcInfo.ScalarLdFrag addr:$src)))>,
T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
}
def : Pat<(DestInfo.VT (X86VBroadcast
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))),
DestInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src)))),
DestInfo.ImmAllZerosV)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)
DestInfo.KRCWM:$mask, addr:$src)>;
}
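// For example, VBROADCASTSS below gets VBROADCASTSSZr/VBROADCASTSSZm at 512
// bits (plus k/kz variants); the trailing patterns fold a broadcast of a
// scalar_to_vector'd load into the memory form.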
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
EVEX_V256;
}
}
multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
EVEX_V128;
}
}
defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss",
avx512vl_f32_info>;
defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
avx512vl_f64_info>, VEX_W;
def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
(VBROADCASTSSZm addr:$src)>;
def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
(VBROADCASTSDZm addr:$src)>;
multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
SDPatternOperator OpNode,
RegisterClass SrcRC> {
let ExeDomain = _.ExeDomain in
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
(_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
}
+multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name,
+ X86VectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg> {
+ let ExeDomain = _.ExeDomain in
+ defm r : AVX512_maskable_custom<opc, MRMSrcReg,
+ (outs _.RC:$dst), (ins GR32:$src),
+ !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+ !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+ "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
+ "$src0 = $dst">, T8PD, EVEX;
+
+ def : Pat <(_.VT (OpNode SrcRC:$src)),
+ (!cast<Instruction>(Name#r)
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
+ (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
+ (!cast<Instruction>(Name#rkz) _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+}
+
+multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name,
+ AVX512VLVectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, _.info512, OpNode, SrcRC,
+ Subreg>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, _.info256, OpNode,
+ SrcRC, Subreg>, EVEX_V256;
+ defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, _.info128, OpNode,
+ SrcRC, Subreg>, EVEX_V128;
+ }
+}
+
multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
SDPatternOperator OpNode,
RegisterClass SrcRC, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256;
defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128;
}
}
-let isCodeGenOnly = 1 in {
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
- X86VBroadcast, GR8, HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
- X86VBroadcast, GR16, HasBWI>;
-}
-let isAsmParserOnly = 1 in {
- defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
- null_frag, GR32, HasBWI>;
- defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
- null_frag, GR32, HasBWI>;
-}
+defm VPBROADCASTBr : avx512_int_broadcastbw_reg_vl<0x7A, "VPBROADCASTBr",
+ avx512vl_i8_info, X86VBroadcast, GR8, sub_8bit, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcastbw_reg_vl<0x7B, "VPBROADCASTWr",
+ avx512vl_i16_info, X86VBroadcast, GR16, sub_16bit,
+ HasBWI>;
defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
X86VBroadcast, GR32, HasAVX512>;
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
(VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
(VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
// Provide aliases for broadcast from the same register class that
// automatically do the extract.
multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#"r")
(EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>;
}
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
EVEX_V512;
// Defined separately to avoid redefinition.
defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
}
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
EVEX_V128;
}
}
defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
avx512vl_i8_info, HasBWI>;
defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
avx512vl_i16_info, HasBWI>;
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
avx512vl_i32_info, HasAVX512>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
avx512vl_i64_info, HasAVX512>, VEX_W;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(_Dst.VT (X86SubVBroadcast
(_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
AVX5128IBase, EVEX;
}
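// For example, VBROADCASTI32X4 below gets VBROADCASTI32X4rm (plus k/kz forms),
// which broadcasts a 128-bit memory operand to every 128-bit lane of the
// 512-bit destination.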
let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
// loadi16 is tricky to fold because !isTypeDesirableForOp, justifiably so.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWZ128m addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
def : Pat<(v8i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ128m addr:$src)>;
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
//
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
v16i32_info, v4i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v16f32_info, v4f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
v8i64_info, v4i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
v8f64_info, v4f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasAVX512] in {
def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
(VBROADCASTI64X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
(VBROADCASTI64X4rm addr:$src)>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4f64 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4i64 VR256X:$src), 1)>;
def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v16i16 VR256X:$src), 1)>;
def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v32i8 VR256X:$src), 1)>;
def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
}
let Predicates = [HasVLX] in {
defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
v8i32x_info, v4i32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
(VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v4f32 VR128X:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
(VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v4i32 VR128X:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
(VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v8i16 VR128X:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
(VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v16i8 VR128X:$src), 1)>;
}
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v4i64x_info, v2i64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
v4f64x_info, v2f64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
(VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v2f64 VR128X:$src), 1)>;
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
(VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v2i64 VR128X:$src), 1)>;
}
let Predicates = [HasVLX, NoDQI] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF32X4Z256rm addr:$src)>;
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4Z256rm addr:$src)>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
(VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v2f64 VR128X:$src), 1)>;
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
(VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v2i64 VR128X:$src), 1)>;
}
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF32X4rm addr:$src)>;
def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
(VBROADCASTI64X4rm addr:$src)>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8f32 VR256X:$src), 1)>;
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
}
let Predicates = [HasDQI] in {
defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
v8i64_info, v2i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
v16i32_info, v8i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
v8f64_info, v2f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
// Provide a fallback in case the load node used in the patterns above
// has additional users, which prevents the patterns from being selected.
def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
(VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8f32 VR256X:$src), 1)>;
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,
EVEX_V256;
}
multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> :
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,
EVEX_V128;
}
defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
avx512vl_i32_info, avx512vl_i64_info>;
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;
let Predicates = [HasVLX] in {
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
(VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
(VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
}
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
(VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
(VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
//---
multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, RegisterClass KRC> {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
}
multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> {
let Predicates = [HasCDI] in
defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512;
let Predicates = [HasCDI, HasVLX] in {
defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256;
defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128;
}
}
defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
avx512vl_i32_info, VK16>;
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
avx512vl_i64_info, VK8>, VEX_W;
//===----------------------------------------------------------------------===//
// -- VPERMI2 - 3 source operands form --
multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
// The index operand in the pattern should really be an integer type. However,
// if we do that and it happens to come from a bitcast, then it becomes
// difficult to find the bitcast needed to convert the index to the
// destination type for the passthru since it will be folded with the bitcast
// of the index operand.
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
AVX5128IBase;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
(_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
EVEX_4V, AVX5128IBase;
}
}
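// For example, for vpermi2d the tied operand ($src1 = $dst) is the index
// vector; the instruction overwrites it with elements gathered from the two
// data sources $src2 and $src3 according to those indices.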
multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermi2X _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>,
avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>,
avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,
avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
}
}
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
Predicate Prd> {
let Predicates = [Prd] in
defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
}
}
defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
// VPERMT2
multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
EVEX_4V, AVX5128IBase;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
(bitconvert (_.LdFrag addr:$src3)))), 1>,
EVEX_4V, AVX5128IBase;
}
}
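// Unlike VPERMI2, the tied operand here is the first data table ($src1 = $dst)
// and the indices come from the separate IdxVT operand $src2; e.g. vpermt2d
// overwrites the first table with the gathered result.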
multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
1>, AVX5128IBase, EVEX_4V, EVEX_B;
}
multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo ShuffleMask> {
defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
ShuffleMask.info512>,
avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
ShuffleMask.info128>,
avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
ShuffleMask.info128>, EVEX_V128;
defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
ShuffleMask.info256>,
avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
ShuffleMask.info256>, EVEX_V256;
}
}
multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo Idx,
Predicate Prd> {
let Predicates = [Prd] in
defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
Idx.info128>, EVEX_V128;
defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
Idx.info256>, EVEX_V256;
}
}
defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w",
avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b",
avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
[]>, EVEX_4V;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_K;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
}
}
}
multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[]>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[]>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
}
multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>,
avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
}
}
multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasBWI] in
defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
}
}
defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
//===----------------------------------------------------------------------===//
// Compare Instructions
//===----------------------------------------------------------------------===//
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
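// Note on the variants below: rr_Int/rm_Int use the full vector register
// class with a comparison-code operand, rrb_Int adds the {sae}
// (suppress-all-exceptions) form, the *_alt defs are assembler-only and take
// the raw 8-bit immediate, and the isCodeGenOnly FRC forms carry the patterns
// used for plain scalar operands.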
multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)>, EVEX_4V;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
(i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs VK1:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
EVEX_4V, EVEX_B;
}// let isAsmParserOnly = 1, hasSideEffects = 0
let isCodeGenOnly = 1 in {
let isCommutable = 1 in
def rr : AVX512Ii8<0xC2, MRMSrcReg,
(outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc),
!strconcat("vcmp${cc}", _.Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
imm:$cc))],
IIC_SSE_ALU_F32S_RR>, EVEX_4V;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
!strconcat("vcmp${cc}", _.Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
}
}
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
AVX512XDIi8Base, VEX_W;
}
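// Integer compares into mask registers (VPCMPEQ*/VPCMPGT*). Note: the
// rrk/rmk variants AND an input write mask into the result, and the _rmb
// multiclass below adds embedded-broadcast memory forms for the 32/64-bit
// element sizes.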
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2)))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
(_.LdFrag addr:$src2))))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, bit IsCommutable> :
avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd,
bit IsCommutable = 0> {
let Predicates = [prd] in
defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256,
IsCommutable>, EVEX_V256;
defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
IsCommutable>, EVEX_V256;
defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>;
defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>;
defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>;
defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>;
defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
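// Note: the lowering patterns below cover the case where a narrow compare
// result is zero-extended into a wider mask vector (insert_subvector into an
// all-zeros value). Because these compares zero the mask bits above the
// vector width, a plain COPY_TO_REGCLASS to the wider k-register class
// should be all that is needed.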
multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
SDNode OpNode, string InstrStr,
list<Predicate> Preds> {
let Predicates = Preds in {
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask,
_.RC:$src1, _.RC:$src2),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and (_.KVT _.KRCWM:$mask),
(_.KVT (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
(_.LdFrag addr:$src2))))))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask,
_.RC:$src1, addr:$src2),
NewInf.KRC)>;
}
}
multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
SDNode OpNode, string InstrStr,
list<Predicate> Preds>
: avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
let Predicates = Preds in {
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and (_.KVT _.KRCWM:$mask),
(_.KVT (OpNode (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask,
_.RC:$src1, addr:$src2),
NewInf.KRC)>;
}
}
// VPCMPEQB - i8
defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQBZ256", [HasBWI, HasVLX]>;
// VPCMPEQW - i16
defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm,
"VPCMPEQWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm,
"VPCMPEQWZ", [HasBWI]>;
// VPCMPEQD - i32
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm,
"VPCMPEQDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm,
"VPCMPEQDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm,
"VPCMPEQDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm,
"VPCMPEQDZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm,
"VPCMPEQDZ", [HasAVX512]>;
// VPCMPEQQ - i64
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm,
"VPCMPEQQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm,
"VPCMPEQQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm,
"VPCMPEQQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm,
"VPCMPEQQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm,
"VPCMPEQQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm,
"VPCMPEQQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm,
"VPCMPEQQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm,
"VPCMPEQQZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm,
"VPCMPEQQZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm,
"VPCMPEQQZ", [HasAVX512]>;
// VPCMPGTB - i8
defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTBZ256", [HasBWI, HasVLX]>;
// VPCMPGTW - i16
defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm,
"VPCMPGTWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm,
"VPCMPGTWZ", [HasBWI]>;
// VPCMPGTD - i32
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm,
"VPCMPGTDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm,
"VPCMPGTDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm,
"VPCMPGTDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm,
"VPCMPGTDZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm,
"VPCMPGTDZ", [HasAVX512]>;
// VPCMPGTQ - i64
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm,
"VPCMPGTQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm,
"VPCMPGTQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm,
"VPCMPGTQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm,
"VPCMPGTQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm,
"VPCMPGTQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm,
"VPCMPGTQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm,
"VPCMPGTQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm,
"VPCMPGTQZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm,
"VPCMPGTQZ", [HasAVX512]>;
defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm,
"VPCMPGTQZ", [HasAVX512]>;
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc)))],
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def rri_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
let mayLoad = 1 in
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
let mayLoad = 1 in
def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
}
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> :
avx512_icmp_cc<opc, Suffix, OpNode, _> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc)))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
}
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
EVEX_V128;
}
}
defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
HasBWI>, EVEX_CD8<8, CD8VF>;
defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
HasBWI>, EVEX_CD8<8, CD8VF>;
defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
HasAVX512>, EVEX_CD8<32, CD8VF>;
defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
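// Note: as with the equality/greater-than compares above, the patterns below
// widen a VPCMP result that is inserted into a larger all-zeros mask vector
// using a simple register-class copy, relying on the instruction zeroing the
// upper mask bits.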
multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
SDNode OpNode, string InstrStr,
list<Predicate> Preds> {
let Predicates = Preds in {
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
_.RC:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
_.RC:$src1,
_.RC:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and (_.KVT _.KRCWM:$mask),
(_.KVT (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
(_.LdFrag addr:$src2))),
imm:$cc)))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
_.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
}
}
multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
SDNode OpNode, string InstrStr,
list<Predicate> Preds>
: avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
let Predicates = Preds in {
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (and (_.KVT _.KRCWM:$mask),
(_.KVT (OpNode (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)),
imm:$cc)))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask,
_.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
}
}
// VPCMPB - i8
defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm,
"VPCMPBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm,
"VPCMPBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm,
"VPCMPBZ256", [HasBWI, HasVLX]>;
// VPCMPW - i16
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm,
"VPCMPWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm,
"VPCMPWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm,
"VPCMPWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm,
"VPCMPWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm,
"VPCMPWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm,
"VPCMPWZ", [HasBWI]>;
// VPCMPD - i32
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm,
"VPCMPDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm,
"VPCMPDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm,
"VPCMPDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm,
"VPCMPDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm,
"VPCMPDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm,
"VPCMPDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm,
"VPCMPDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm,
"VPCMPDZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm,
"VPCMPDZ", [HasAVX512]>;
// VPCMPQ - i64
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm,
"VPCMPQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm,
"VPCMPQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm,
"VPCMPQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm,
"VPCMPQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm,
"VPCMPQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm,
"VPCMPQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm,
"VPCMPQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm,
"VPCMPQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm,
"VPCMPQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm,
"VPCMPQZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm,
"VPCMPQZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm,
"VPCMPQZ", [HasAVX512]>;
// VPCMPUB - i8
defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu,
"VPCMPUBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu,
"VPCMPUBZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu,
"VPCMPUBZ256", [HasBWI, HasVLX]>;
// VPCMPUW - i16
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu,
"VPCMPUWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu,
"VPCMPUWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu,
"VPCMPUWZ128", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu,
"VPCMPUWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu,
"VPCMPUWZ256", [HasBWI, HasVLX]>;
defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu,
"VPCMPUWZ", [HasBWI]>;
// VPCMPUD - i32
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu,
"VPCMPUDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu,
"VPCMPUDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu,
"VPCMPUDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu,
"VPCMPUDZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu,
"VPCMPUDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu,
"VPCMPUDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu,
"VPCMPUDZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu,
"VPCMPUDZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu,
"VPCMPUDZ", [HasAVX512]>;
// VPCMPUQ - i64
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu,
"VPCMPUQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu,
"VPCMPUQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu,
"VPCMPUQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu,
"VPCMPUQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu,
"VPCMPUQZ128", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu,
"VPCMPUQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu,
"VPCMPUQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu,
"VPCMPUQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu,
"VPCMPUQZ256", [HasAVX512, HasVLX]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu,
"VPCMPUQZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu,
"VPCMPUQZ", [HasAVX512]>;
defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu,
"VPCMPUQZ", [HasAVX512]>;
multiclass avx512_vcmp_common<X86VectorVTInfo _> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc), 1>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
imm:$cc)>,EVEX_B;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">;
let mayLoad = 1 in {
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">;
defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B;
}
}
}
multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...])
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(X86cmpmRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
(i32 FROUND_NO_EXC))>, EVEX_B;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $cc">, EVEX_B;
}
}
multiclass avx512_vcmp<AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcmp_common<_.info512>,
avx512_vcmp_sae<_.info512>, EVEX_V512;
}
let Predicates = [HasAVX512,HasVLX] in {
defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128;
defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256;
}
}
defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
string InstrStr, list<Predicate> Preds> {
let Predicates = Preds in {
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
_.RC:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (X86cmpm (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (X86cmpm (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc)),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1,
addr:$src2,
imm:$cc),
NewInf.KRC)>;
}
}
multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
string InstrStr, list<Predicate> Preds>
: avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> {
let Predicates = Preds in
def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
(_.KVT (X86cmpmRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
(i32 FROUND_NO_EXC))),
(i64 0)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1,
_.RC:$src2,
imm:$cc),
NewInf.KRC)>;
}
// VCMPPS - f32
defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ",
[HasAVX512]>;
defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ",
[HasAVX512]>;
// VCMPPD - f64
defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256",
[HasAVX512, HasVLX]>;
defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ",
[HasAVX512]>;
defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ",
[HasAVX512]>;
defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ",
[HasAVX512]>;
// ----------------------------------------------------------------
// FPClass
// Handle fpclass instruction mask = op(reg_scalar, imm)
//                                   op(mem_scalar, imm)
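// Note: vfpclass sets a mask bit when the source element matches any of the
// categories selected by the immediate (roughly: NaN, +/-0, +/-Inf, denormal,
// negative); see the Intel SDM for the exact immediate bit assignments.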
multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2)))], NoItinerary>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2)))], NoItinerary>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
}
}
// Handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm)
//                                   fpclass(reg_vec, mem_vec, imm)
//                                   fpclass(reg_vec, broadcast(eltVt), imm)
multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string mem, string broadcast>{
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2)))], NoItinerary>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2)))], NoItinerary>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst|$dst, ${src1}"
##_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2)))], NoItinerary>,EVEX_B;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2))))], NoItinerary>,
EVEX_B, EVEX_K;
}
multiclass avx512_vector_fpclass_all<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
string broadcast>{
let Predicates = [prd] in {
defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
broadcast>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
broadcast>, EVEX_V128;
defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
broadcast>, EVEX_V256;
}
}
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W;
defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W;
}
defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX;
//-----------------------------------------------------------------
// Mask register copy, including
// - copy between mask registers
// - load/store mask registers
// - copy from GPR to mask register and vice versa
//
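// Note: the required feature follows the mask width below: KMOVB needs DQI,
// KMOVW only AVX512F, and KMOVD/KMOVQ need BWI, matching the predicates on
// the defm lines.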
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, X86MemOperand x86memop> {
let hasSideEffects = 0 in
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (vvt (load addr:$src)))]>;
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(store KRC:$src, addr:$dst)]>;
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
string OpcodeStr,
RegisterClass KRC, RegisterClass GRC> {
let hasSideEffects = 0 in {
def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
}
}
let Predicates = [HasDQI] in
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
VEX, PD;
let Predicates = [HasAVX512] in
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
VEX, PS;
let Predicates = [HasBWI] in {
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
VEX, XD;
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
VEX, XD, VEX_W;
}
// GR from/to mask register
def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;
def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
(KMOVWrk VK16:$src)>;
def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
(COPY_TO_REGCLASS VK16:$src, GR32)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
(COPY_TO_REGCLASS VK8:$src, GR32)>;
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
(COPY_TO_REGCLASS GR32:$src, VK32)>;
def : Pat<(i32 (bitconvert (v32i1 VK32:$src))),
(COPY_TO_REGCLASS VK32:$src, GR32)>;
def : Pat<(v64i1 (bitconvert (i64 GR64:$src))),
(COPY_TO_REGCLASS GR64:$src, VK64)>;
def : Pat<(i64 (bitconvert (v64i1 VK64:$src))),
(COPY_TO_REGCLASS VK64:$src, GR64)>;
// Load/store kreg
let Predicates = [HasDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVBmk addr:$dst, VK8:$src)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(KMOVBkm addr:$src)>;
def : Pat<(store VK4:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
def : Pat<(store VK2:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
def : Pat<(store VK1:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
def : Pat<(v2i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
def : Pat<(v4i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
}
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store VK1:$src, addr:$dst),
(MOV8mr addr:$dst,
(i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
sub_8bit)))>;
def : Pat<(store VK2:$src, addr:$dst),
(MOV8mr addr:$dst,
(i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
sub_8bit)))>;
def : Pat<(store VK4:$src, addr:$dst),
(MOV8mr addr:$dst,
(i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
sub_8bit)))>;
def : Pat<(store VK8:$src, addr:$dst),
(MOV8mr addr:$dst,
(i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
sub_8bit)))>;
def : Pat<(v8i1 (load addr:$src)),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
def : Pat<(v2i1 (load addr:$src)),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
def : Pat<(v4i1 (load addr:$src)),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
(KMOVDmk addr:$dst, VK32:$src)>;
def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
(KMOVDkm addr:$src)>;
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
(KMOVQmk addr:$dst, VK64:$src)>;
def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
(KMOVQkm addr:$src)>;
}
let Predicates = [HasAVX512] in {
multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> {
def : Pat<(maskVT (scalar_to_vector GR32:$src)),
(COPY_TO_REGCLASS GR32:$src, maskRC)>;
def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
(COPY_TO_REGCLASS maskRC:$src, GR32)>;
def : Pat<(maskVT (scalar_to_vector GR8:$src)),
(COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
(COPY_TO_REGCLASS maskRC:$src, GR32)>;
}
defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
(COPY_TO_REGCLASS
(KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit), (i32 1))), VK1)>;
def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
(COPY_TO_REGCLASS
(KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit), (i32 1))), VK16)>;
def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) ,
(COPY_TO_REGCLASS
(KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src, sub_8bit), (i32 1))), VK8)>;
}
// Mask unary operation
// - KNOT
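// Note: knot computes the bitwise complement of a whole mask register; the
// B/W/D/Q widths are gated on DQI, AVX512F, BWI and BWI respectively in
// avx512_mask_unop_all below.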
multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
Predicate prd> {
let Predicates = [prd] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (OpNode KRC:$src))]>;
}
multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode> {
defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
HasDQI>, VEX, PD;
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
HasAVX512>, VEX, PS;
defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
HasBWI>, VEX, PD, VEX_W;
defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
HasBWI>, VEX, PS, VEX_W;
}
defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
// KNL does not support KMOVB, so an 8-bit mask is promoted to 16 bits
let Predicates = [HasAVX512, NoDQI] in
def : Pat<(vnot VK8:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
def : Pat<(vnot VK4:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
def : Pat<(vnot VK2:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
// Mask binary operation
// - KAND, KANDN, KOR, KXNOR, KXOR
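// Note: these are plain bitwise operations on whole mask registers; the
// andn/xnor and vandn/vxnor PatFrags below provide the scalar and vector
// (vnot-based) flavors of the same operations for pattern matching.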
multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
Predicate prd, bit IsCommutable> {
let Predicates = [prd], isCommutable = IsCommutable in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
}
multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, bit IsCommutable,
Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
prdW, IsCommutable>, VEX_4V, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
// These nodes use 'vnot' instead of 'not' to support vectors.
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>;
defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>;
defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
// With AVX512F, an 8-bit mask is promoted to a 16-bit mask; with the DQI
// extension this type is legal and the KxxxB instructions are used directly.
let Predicates = [NoDQI] in
def : Pat<(VOpNode VK8:$src1, VK8:$src2),
(COPY_TO_REGCLASS
(Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
(COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
// All types smaller than 8 bits require conversion anyway
def : Pat<(OpNode VK1:$src1, VK1:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK1:$src1, VK16),
(COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
def : Pat<(VOpNode VK2:$src1, VK2:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK2:$src1, VK16),
(COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>;
def : Pat<(VOpNode VK4:$src1, VK4:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK4:$src1, VK16),
(COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>;
}
defm : avx512_binop_pat<and, and, KANDWrr>;
defm : avx512_binop_pat<vandn, andn, KANDNWrr>;
defm : avx512_binop_pat<or, or, KORWrr>;
defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
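// Note: kunpck concatenates two narrower masks into one wider mask. The
// pattern below swaps the operands because the instruction's second source
// appears to supply the low half of the result, matching
// concat_vectors($src1, $src2) with $src1 in the low lanes.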
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
RegisterClass KRCSrc, Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
(ins KRC:$src1, KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
VEX_4V, VEX_L;
def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
(!cast<Instruction>(NAME##rr)
(COPY_TO_REGCLASS KRCSrc:$src2, KRC),
(COPY_TO_REGCLASS KRCSrc:$src1, KRC))>;
}
}
defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
// Mask bit testing
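// Note: these tests only set EFLAGS. For kortest, ZF is set when the OR of
// the two masks is all zeros and CF when it is all ones; ktest (DQI) reports
// on AND/ANDN of the operands instead.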
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
SDNode OpNode, Predicate prd> {
let Predicates = [prd], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
}
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
Predicate prdW = HasAVX512> {
defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>,
VEX, PD;
defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>,
VEX, PS;
defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>,
VEX, PS, VEX_W;
defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>,
VEX, PD, VEX_W;
}
defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
SDNode OpNode> {
let Predicates = [HasAVX512] in
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
[(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
}
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
SDNode OpNode> {
defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
VEX, TAPD;
let Predicates = [HasBWI] in {
defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
VEX, TAPD, VEX_W;
defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
VEX, TAPD;
}
}
defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>;
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>;
multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr)
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(i64 0)),
(KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr)
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
(i8 8)), (i8 8))>;
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v8i1 (and VK8:$mask,
(OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
(i64 0)),
(KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk)
(COPY_TO_REGCLASS VK8:$mask, VK16),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
(i8 8)), (i8 8))>;
}
multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
AVX512VLVectorVTInfo _> {
def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
(COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri)
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
(i64 0)),
(KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri)
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc),
(i8 8)), (i8 8))>;
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v8i1 (and VK8:$mask,
(OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))),
(i64 0)),
(KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik)
(COPY_TO_REGCLASS VK8:$mask, VK16),
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc),
(i8 8)), (i8 8))>;
}
let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>;
}
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
let Predicates = [HasAVX512] in
let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
[(set KRC:$dst, (VT Val))]>;
}
multiclass avx512_mask_setop_w<PatFrag Val> {
defm W : avx512_mask_setop<VK16, v16i1, Val>;
defm D : avx512_mask_setop<VK32, v32i1, Val>;
defm Q : avx512_mask_setop<VK64, v64i1, Val>;
}
defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
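// Illustrative sketch, not part of the X86 backend: KSET0*/KSET1* above are
// pseudo instructions whose flags come from the surrounding 'let' blocks; a
// 'let' statement overrides the listed fields on every def it encloses. A
// minimal demo with hypothetical names:
class DemoPseudo {
  bit isPseudo           = 0;
  bit isReMaterializable = 0;
  bit isAsCheapAsAMove   = 0;
}
let isPseudo = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
  def DEMO_KSET0 : DemoPseudo;  // all three bits forced to 1 by the let block
  def DEMO_KSET1 : DemoPseudo;
}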
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
RegisterClass RC, ValueType VT> {
def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
(subVT (COPY_TO_REGCLASS RC:$src, subRC))>;
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (COPY_TO_REGCLASS subRC:$src, RC))>;
}
defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>;
defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>;
defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>;
defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>;
defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>;
defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK16, v16i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK32, v32i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK4, v4i1, VK8, v8i1>;
defm : operation_subvector_mask_lowering<VK4, v4i1, VK16, v16i1>;
defm : operation_subvector_mask_lowering<VK4, v4i1, VK32, v32i1>;
defm : operation_subvector_mask_lowering<VK4, v4i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK8, v8i1, VK16, v16i1>;
defm : operation_subvector_mask_lowering<VK8, v8i1, VK32, v32i1>;
defm : operation_subvector_mask_lowering<VK8, v8i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>;
defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
(v2i1 (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
VK2))>;
def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))),
(v4i1 (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
VK4))>;
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
(v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
(v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
// Patterns for kmask shift
multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
RC))>;
def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
RC))>;
}
defm : mask_shift_lowering<VK8, v8i1>, Requires<[HasAVX512, NoDQI]>;
defm : mask_shift_lowering<VK4, v4i1>, Requires<[HasAVX512]>;
defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
// AVX-512 - Aligned and unaligned load and store
//
multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag ld_frag, PatFrag mload,
SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
_.ExeDomain>, EVEX;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
_.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ;
let canFoldAsLoad = 1, isReMaterializable = 1,
SchedRW = [WriteLoad] in
def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
_.ExeDomain>, EVEX;
let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src1),
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K;
let SchedRW = [WriteLoad] in
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT
(vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src1))),
(_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
}
let SchedRW = [WriteLoad] in
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
"${dst} {${mask}} {z}, $src}",
[(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
_.ExeDomain>, EVEX, EVEX_KZ;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
(!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
(!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
(!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0,
_.KRCWM:$mask, addr:$ptr)>;
}
multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
masked_load_aligned512>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
masked_load_aligned256>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
masked_load_aligned128>, EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
masked_load_unaligned, SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
masked_load_unaligned, SelectOprr>, EVEX_V256;
defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
masked_load_unaligned, SelectOprr>, EVEX_V128;
}
}
multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag, PatFrag mstore, string Name> {
let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
[], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
[], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
}
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;
def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
[], _.ExeDomain>, EVEX, EVEX_K;
def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
_.KRCWM:$mask, _.RC:$src)>;
}
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
masked_store_unaligned, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
masked_store_unaligned, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
masked_store_unaligned, Name#Z128>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
masked_store_aligned512, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
masked_store_aligned256, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
masked_store_aligned128, Name#Z128>, EVEX_V128;
}
}
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
HasAVX512, "VMOVAPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
HasAVX512, "VMOVAPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
"VMOVUPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
"VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
HasAVX512, "VMOVDQA32">,
PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
HasAVX512, "VMOVDQA64">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
HasBWI, "VMOVDQU8">,
XD, EVEX_CD8<8, CD8VF>;
defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
HasBWI, "VMOVDQU16">,
XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
HasAVX512, "VMOVDQU32">,
XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512, "VMOVDQU64">,
XS, VEX_W, EVEX_CD8<64, CD8VF>;
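// Illustrative sketch, not part of the X86 backend: each *_vl multiclass above
// emits a 512-bit Z form plus, under HasVLX, Z256 and Z128 forms, so one
// 'defm VMOVAPS' produces VMOVAPSZ*, VMOVAPSZ256* and VMOVAPSZ128* records.
// A minimal demo of that three-way expansion (predicate gating omitted),
// using hypothetical names:
class DemoVecMove<string Mnemonic, int VecBits> {
  string Asm  = Mnemonic;
  int    Bits = VecBits;
}
multiclass demo_move_vl<string Mnemonic> {
  def Z    : DemoVecMove<Mnemonic, 512>;
  def Z256 : DemoVecMove<Mnemonic, 256>;
  def Z128 : DemoVecMove<Mnemonic, 128>;
}
defm DEMO_VMOVAPS : demo_move_vl<"vmovaps">;  // -> DEMO_VMOVAPSZ/Z256/Z128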
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
// expandPostRAPseudos.
let isReMaterializable = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
"", []>;
def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
"", []>;
def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
"", []>;
def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
"", []>;
}
let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
"", []>;
def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
"", []>;
def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
"", []>;
def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
"", []>;
}
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
VK8), VR512:$src)>;
def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
// These patterns exist to prevent the above patterns from introducing a second
// mask inversion when one already exists.
def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)),
(bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>;
def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
(v16i32 immAllZerosV),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
(v8f32 VR256X:$src0))),
(EXTRACT_SUBREG
(v16f32
(VMOVAPSZrrk
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
(COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
sub_ymm)>;
def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
(v8i32 VR256X:$src0))),
(EXTRACT_SUBREG
(v16i32
(VMOVDQA32Zrrk
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
(COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
sub_ymm)>;
}
let Predicates = [HasVLX, NoBWI] in {
// 128-bit load/store without BWI.
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
// 256-bit load/store without BWI.
def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
}
let Predicates = [HasVLX] in {
// Special patterns for storing subvector extracts of lower 128-bits of 256.
// It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
def : Pat<(alignedstore (v2f64 (extract_subvector
(v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v4f32 (extract_subvector
(v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v2i64 (extract_subvector
(v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v4i32 (extract_subvector
(v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v8i16 (extract_subvector
(v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(alignedstore (v16i8 (extract_subvector
(v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v2f64 (extract_subvector
(v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v4f32 (extract_subvector
(v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v2i64 (extract_subvector
(v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v4i32 (extract_subvector
(v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v8i16 (extract_subvector
(v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
def : Pat<(store (v16i8 (extract_subvector
(v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
// Special patterns for storing subvector extracts of lower 128-bits of 512.
// It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
def : Pat<(alignedstore (v2f64 (extract_subvector
(v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(alignedstore (v4f32 (extract_subvector
(v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(alignedstore (v2i64 (extract_subvector
(v8i64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(alignedstore (v4i32 (extract_subvector
(v16i32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(alignedstore (v8i16 (extract_subvector
(v32i16 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(alignedstore (v16i8 (extract_subvector
(v64i8 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v2f64 (extract_subvector
(v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v4f32 (extract_subvector
(v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v2i64 (extract_subvector
(v8i64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v4i32 (extract_subvector
(v16i32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v8i16 (extract_subvector
(v32i16 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
def : Pat<(store (v16i8 (extract_subvector
(v64i8 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
// Special patterns for storing subvector extracts of lower 256-bits of 512.
// It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
def : Pat<(alignedstore256 (v4f64 (extract_subvector
(v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore (v8f32 (extract_subvector
(v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore256 (v4i64 (extract_subvector
(v8i64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore256 (v8i32 (extract_subvector
(v16i32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore256 (v16i16 (extract_subvector
(v32i16 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore256 (v32i8 (extract_subvector
(v64i8 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v4f64 (extract_subvector
(v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v8f32 (extract_subvector
(v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v4i64 (extract_subvector
(v8i64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v8i32 (extract_subvector
(v16i32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v16i16 (extract_subvector
(v32i16 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(store (v32i8 (extract_subvector
(v64i8 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
}
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
EVEX;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, EVEX, VEX_W;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}", []>,
EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
let isCodeGenOnly = 1 in {
def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
EVEX_CD8<64, CD8VT1>;
}
} // ExeDomain = SSEPackedInt
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, EVEX;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
EVEX;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, EVEX_CD8<32, CD8VT1>;
} // ExeDomain = SSEPackedInt
// Move quadword from xmm1 register to r/m64
//
let ExeDomain = SSEPackedInt in {
def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
(ins i64mem:$dst, VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
addr:$dst)], IIC_SSE_MOVDQ>,
EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>,
Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
let hasSideEffects = 0 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq.s\t{$src, $dst|$dst, $src}",[]>,
EVEX, VEX_W;
} // ExeDomain = SSEPackedInt
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
IIC_SSE_MOVD_ToGP>, EVEX;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
//
let ExeDomain = SSEPackedInt in {
def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
} // ExeDomain = SSEPackedInt
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
(scalar_to_vector _.FRC:$src2))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1,
(scalar_to_vector _.FRC:$src2))),
_.ImmAllZerosV)))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1,
(scalar_to_vector _.FRC:$src2))),
(_.VT _.RC:$src0))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
_.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|",
"$dst {${mask}}, $src}"),
[], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K;
def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src}"),
[], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ;
}
def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
EVEX;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
[], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
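// Illustrative note, not part of the X86 backend: the "{...|...}" pieces in
// the asm strings above carry both assembler dialects, AT&T operand order
// before the '|' and Intel order after it, so vmovss/vmovsd print correctly
// under either syntax. A minimal demo of assembling such a string, using a
// hypothetical class:
class DemoAsmString<string Mnemonic> {
  string AsmString = !strconcat(Mnemonic,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}");
}
def DEMO_VMOVSS_ASM : DemoAsmString<"vmovss">;
def DEMO_VMOVSD_ASM : DemoAsmString<"vmovsd">;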
multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
PatLeaf ZeroFP, X86VectorVTInfo _> {
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
(_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
(COPY_TO_REGCLASS _.FRC:$src2, _.RC),
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
(_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
(_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
(_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC> {
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT _.info128.RC:$src),
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC,
SubRegIndex subreg> {
def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT _.info128.RC:$src),
(iPTR 0))),
(iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC> {
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (bitconvert
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
(COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
}
multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC,
SubRegIndex subreg> {
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (bitconvert
(v16i32 immAllZerosV))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
(iPTR 0))),
(iPTR 0))))),
(iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
}
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
let hasSideEffects = 0 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR32X:$src2),
"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], NoItinerary>, XS, EVEX_4V, VEX_LIG,
FoldGenData<"VMOVSSZrr">;
let Constraints = "$src0 = $dst" in
def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
VR128X:$src1, FR32X:$src2),
"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
[], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG,
FoldGenData<"VMOVSSZrrk">;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2),
"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
[], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
FoldGenData<"VMOVSSZrrkz">;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, FR64X:$src2),
"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W,
FoldGenData<"VMOVSDZrr">;
let Constraints = "$src0 = $dst" in
def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
VR128X:$src1, FR64X:$src2),
"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
[], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG,
VEX_W, FoldGenData<"VMOVSDZrrk">;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.KRCWM:$mask, VR128X:$src1,
FR64X:$src2),
"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
[], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
VEX_W, FoldGenData<"VMOVSDZrrkz">;
}
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
// Move a scalar to XMM zero-extended: zero a VR128X, then do a
// MOVS{S,D} to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
(VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
}
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
let AddedComplexity = 20 in {
// MOVSSrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v16f32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
FR32X:$src)), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
FR64X:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4i32 VR128X:$src1),
(COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4f32 VR128X:$src1),
(COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
// 256-bit variants
def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
sub_xmm)>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
(EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
sub_xmm)>;
def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
}
let AddedComplexity = 15 in
def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst, (v2i64 (X86vzmovl
(v2i64 VR128X:$src))))],
IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
(VMOVDI2PDIZrr GR32:$src)>;
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIZrr GR64:$src)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
}
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
let SchedRW = [WriteLoad] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
EVEX_CD8<64, CD8VF>;
}
}
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag = alignednontemporalstore,
InstrItinClass itin = IIC_SSE_MOVNT> {
let SchedRW = [WriteStore], AddedComplexity = 400 in
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (_.VT _.RC:$src), addr:$dst)],
_.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
}
}
defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD;
defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W;
defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS;
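// Illustrative sketch, not part of the X86 backend: avx512_movnt above takes
// defaulted template parameters (st_frag and itin), so most callers pass only
// the opcode, mnemonic and type info and inherit alignednontemporalstore and
// IIC_SSE_MOVNT. A minimal demo of defaulted multiclass parameters, using
// hypothetical names:
class DemoNTStore<string Mnemonic, int Alignment> {
  string Asm   = Mnemonic;
  int    Align = Alignment;
}
multiclass demo_movnt<string Mnemonic, int Alignment = 64> {
  def mr : DemoNTStore<Mnemonic, Alignment>;
}
defm DEMO_VMOVNTDQ : demo_movnt<"vmovntdq">;      // uses the default Alignment
defm DEMO_VMOVNTPS : demo_movnt<"vmovntps", 16>;  // overrides the default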
let Predicates = [HasAVX512], AddedComplexity = 400 in {
def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst),
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst),
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
def : Pat<(v8f64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v16f32 (alignednontemporalload addr:$src)),
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v8i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZrm addr:$src)>;
}
let Predicates = [HasVLX], AddedComplexity = 400 in {
def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst),
(VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst),
(VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
(VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(v4f64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v8f32 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v4i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(v2f64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v4f32 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
(VMOVNTDQAZ128rm addr:$src)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 - Integer arithmetic
//
multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, OpndItins itins,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))),
itins.rm>,
AVX512BIBase, EVEX_4V;
}
multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, OpndItins itins,
bit IsCommutable = 0> :
avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, OpndItins itins,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
IsCommutable>, EVEX_V256;
defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, OpndItins itins,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
IsCommutable>, EVEX_V256;
defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
itins, prd, IsCommutable>,
VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
}
multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
}
multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
SDNode OpNode, OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
IsCommutable>;
defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
SDNode OpNode, OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd,
IsCommutable>;
defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd,
IsCommutable>;
}
multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode,
OpndItins itins, bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
itins, HasAVX512, IsCommutable>,
avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
itins, HasBWI, IsCommutable>;
}
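// avx512_binop_rm2 handles operations whose source and destination vector
// types differ (e.g. vpmuldq: i32 sources, i64 results); the broadcast form
// loads a scalar of the _Brdct type.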
multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct,
bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
itins.rr, IsCommutable>,
AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(bitconvert (_Src.LdFrag addr:$src2)))),
itins.rm>,
AVX512BIBase, EVEX_4V;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
OpcodeStr,
"${src2}"##_Brdct.BroadcastStr##", $src1",
"$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (X86VBroadcast
(_Brdct.ScalarLdFrag addr:$src2)))))),
itins.rm>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
SSE_INTALU_ITINS_P, 1>;
defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
SSE_INTALU_ITINS_P, 0>;
defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs,
SSE_INTALU_ITINS_P, HasBWI, 0>;
defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
SSE_INTALU_ITINS_P, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P,
HasBWI, 1>;
defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P,
HasBWI, 1>;
defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P,
HasBWI, 1>, T8PD;
defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg,
SSE_INTALU_ITINS_P, HasBWI, 1>;
multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo,
SDNode OpNode, Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
_SrcVTInfo.info512, _DstVTInfo.info512,
v8i64_info, IsCommutable>,
EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasVLX, prd] in {
defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
_SrcVTInfo.info256, _DstVTInfo.info256,
v4i64x_info, IsCommutable>,
EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W;
defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode,
_SrcVTInfo.info128, _DstVTInfo.info128,
v2i64x_info, IsCommutable>,
EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W;
}
}
defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P,
avx512vl_i32_info, avx512vl_i64_info,
X86pmuldq, HasAVX512, 1>,T8PD;
defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P,
avx512vl_i32_info, avx512vl_i64_info,
X86pmuludq, HasAVX512, 1>;
defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_P,
avx512vl_i8_info, avx512vl_i8_info,
X86multishift, HasVBMI, 0>, T8PD;
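// Pack instructions narrow the source elements, so separate _Src and _Dst
// vector type infos are threaded through the multiclasses below.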
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
"${src2}"##_Src.BroadcastStr##", $src1",
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
(_Src.ScalarLdFrag addr:$src2))))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
}
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
SDNode OpNode,X86VectorVTInfo _Src,
X86VectorVTInfo _Dst, bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
NoItinerary, IsCommutable>,
EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(bitconvert (_Src.LdFrag addr:$src2))))>,
EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
}
multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
v32i16_info>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
v32i16_info>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
v16i16x_info>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
v16i16x_info>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
v8i16x_info>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
v8i16x_info>, EVEX_V128;
}
}
multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
v64i8_info>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
v32i8x_info>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
v16i8x_info>, EVEX_V128;
}
}
multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
SDNode OpNode, AVX512VLVectorVTInfo _Src,
AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
_Dst.info512, IsCommutable>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
_Dst.info256, IsCommutable>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
_Dst.info128, IsCommutable>, EVEX_V128;
}
}
defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase;
defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, AVX5128IBase;
defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase;
defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>;
defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin,
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
// PMULLQ: Use the 512-bit version to implement the 128/256-bit forms when VLX is not available (NoVLX).
let Predicates = [HasDQI, NoVLX] in {
def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG
(VPMULLQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG
(VPMULLQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
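// The logical multiclasses match the operation on the i64 vector type and
// bitconvert the operands; the d and q variants differ only in masking
// granularity and embedded-broadcast element size.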
multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, bit IsCommutable = 0> {
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.VT _.RC:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
_.RC:$src2)))),
IIC_SSE_BIT_P_RR, IsCommutable>,
AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.LdFrag addr:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))))),
IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V;
}
multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, bit IsCommutable = 0> :
avx512_logic_rm<opc, OpcodeStr, OpNode, _, IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.i64VT (OpNode _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))))),
IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo,
bit IsCommutable = 0> {
let Predicates = [HasAVX512] in
defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
IsCommutable>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
IsCommutable>, EVEX_V256;
defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
IsCommutable>,
VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
SDNode OpNode, bit IsCommutable = 0> {
defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>;
defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>;
}
defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>;
defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>;
defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
//===----------------------------------------------------------------------===//
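// Scalar FP arithmetic: the _Int forms operate on the full vector register
// with masking, while the isCodeGenOnly rr/rm forms use the scalar
// FR32X/FR64X register classes.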
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, OpndItins itins,
bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2,
(i32 FROUND_CURRENT))),
itins.rr>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2,
(i32 FROUND_CURRENT))),
itins.rm>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
itins.rr> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))], itins.rm>;
}
}
}
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
let ExeDomain = _.ExeDomain in
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$rc)), itins.rr, IsCommutable>,
EVEX_B, EVEX_RC;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
OpndItins itins, bit IsCommutable> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2)),
itins.rr>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2)),
itins.rm>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
itins.rr> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))], itins.rm>;
}
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode VecNode,
SizeItins itins, bit IsCommutable> {
defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
itins.s, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode,
itins.s, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
itins.d, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode,
itins.d, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode VecNode, SDNode SaeNode,
SizeItins itins, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
VecNode, SaeNode, itins.s, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
VecNode, SaeNode, itins.d, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
SSE_ALU_ITINS_S, 0>;
defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
SSE_ALU_ITINS_S, 0>;
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax.
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
itins.rr> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))], itins.rm>;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
SSE_ALU_ITINS_S.s>, XS, EVEX_4V, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
SSE_ALU_ITINS_S.d>, XD, VEX_W, EVEX_4V, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
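// Packed FP arithmetic: masked rr, rm and broadcast (rmb) forms for one
// vector type; rounding-control and SAE variants are defined separately
// below.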
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, OpndItins itins,
bit IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
IsCommutable>, EVEX_4V;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
EVEX_4V;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
itins.rm>, EVEX_4V, EVEX_B;
}
}
}
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
EVEX_4V, EVEX_B, EVEX_RC;
}
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
EVEX_4V, EVEX_B;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
Predicate prd, SizeItins itins,
bit IsCommutable = 0> {
let Predicates = [prd] in {
defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
itins.s, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
itins.d, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
itins.s, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
itins.s, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
itins.d, IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
itins.d, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
}
multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
SSE_ALU_ITINS_P, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
SSE_MUL_ITINS_P, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
SSE_ALU_ITINS_P, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
SSE_ALU_ITINS_P, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
SSE_ALU_ITINS_P, 1>;
defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
SSE_ALU_ITINS_P, 1>;
}
defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
SSE_ALU_ITINS_P, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
SSE_ALU_ITINS_P, 0>;
defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
SSE_ALU_ITINS_P, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
SSE_ALU_ITINS_P, 1>;
// Patterns to catch floating-point selects whose operands are bitcasted integer logic ops.
multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
X86VectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
// Masked register-register logical operations.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, _.RC:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
_.RC:$src2)>;
// Masked register-memory logical operations.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert (_.i64VT (OpNode _.RC:$src1,
(load addr:$src2)))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
addr:$src2)>;
// Register-broadcast logical operations.
def : Pat<(_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))),
(!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.VT
(X86VBroadcast
(_.ScalarLdFrag addr:$src2))))))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.VT
(X86VBroadcast
(_.ScalarLdFrag addr:$src2))))))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
}
}
multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
}
defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
let Predicates = [HasVLX,HasDQI] in {
// Use packed logical operations for scalar ops.
def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
(COPY_TO_REGCLASS (VANDPDZ128rr
(COPY_TO_REGCLASS FR64X:$src1, VR128X),
(COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
(COPY_TO_REGCLASS (VORPDZ128rr
(COPY_TO_REGCLASS FR64X:$src1, VR128X),
(COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
(COPY_TO_REGCLASS (VXORPDZ128rr
(COPY_TO_REGCLASS FR64X:$src1, VR128X),
(COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
(COPY_TO_REGCLASS (VANDNPDZ128rr
(COPY_TO_REGCLASS FR64X:$src1, VR128X),
(COPY_TO_REGCLASS FR64X:$src2, VR128X)), FR64X)>;
def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
(COPY_TO_REGCLASS (VANDPSZ128rr
(COPY_TO_REGCLASS FR32X:$src1, VR128X),
(COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
(COPY_TO_REGCLASS (VORPSZ128rr
(COPY_TO_REGCLASS FR32X:$src1, VR128X),
(COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
(COPY_TO_REGCLASS (VXORPSZ128rr
(COPY_TO_REGCLASS FR32X:$src1, VR128X),
(COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
(COPY_TO_REGCLASS (VANDNPSZ128rr
(COPY_TO_REGCLASS FR32X:$src1, VR128X),
(COPY_TO_REGCLASS FR32X:$src2, VR128X)), FR32X)>;
}
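// VSCALEF: packed forms (with rounding-control variants) plus scalar forms.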
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
EVEX_4V, EVEX_B;
}
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1,
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
}
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
EVEX_4V,EVEX_CD8<32, CD8VT1>;
defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs>, T8PD;
//===----------------------------------------------------------------------===//
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
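// VPTESTM/VPTESTNM produce a mask register result, so these multiclasses use
// AVX512_maskable_cmp with a KRC destination.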
multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
EVEX_4V;
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))))>,
EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))))>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
// Use the 512-bit version to implement the 128/256-bit forms when VLX is not available (NoVLX).
multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
X86VectorVTInfo _, string Suffix> {
def : Pat<(_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
(_.KVT (COPY_TO_REGCLASS
(!cast<Instruction>(NAME # Suffix # "Zrr")
(INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
_.RC:$src1, _.SubRegIdx),
(INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
_.RC:$src2, _.SubRegIdx)),
_.KRC))>;
}
multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _, string Suffix> {
let Predicates = [HasAVX512] in
defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>,
avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, Suffix>;
}
}
multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
avx512vl_i32_info, "D">;
defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
avx512vl_i64_info, "Q">, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in {
defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>,
EVEX_V512, VEX_W;
defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>,
EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>,
EVEX_V256, VEX_W;
defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>,
EVEX_V128, VEX_W;
defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>,
EVEX_V256;
defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
defm BZ256_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v32i8x_info, "B">;
defm BZ128_Alt : avx512_vptest_lowering< OpNode, v64i8_info, v16i8x_info, "B">;
defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
}
}
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
SDNode OpNode> :
avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>,
avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>;
defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
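// Shift-by-immediate forms (ri/mi, plus the broadcast mbi form); the
// shift-by-count forms in avx512_shift_rrm take the count in a 128-bit
// register.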
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
SSE_INTSHIFT_ITINS_P.rr>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))),
SSE_INTSHIFT_ITINS_P.rm>;
}
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
// src2 is always 128-bit
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
EVEX_4V;
}
}
multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType SrcVT, PatFrag bc_frag,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
VTInfo.info512>, EVEX_V512,
EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
VTInfo.info256>, EVEX_V256,
EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
VTInfo.info128>, EVEX_V128,
EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
}
}
multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
string OpcodeStr, SDNode OpNode> {
defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
avx512vl_i32_info, HasAVX512>;
defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
avx512vl_i64_info, HasAVX512>, VEX_W;
defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16,
avx512vl_i16_info, HasBWI>;
}
multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info512>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info256>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info256>, EVEX_V256;
defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info128>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info128>, EVEX_V128;
}
}
multiclass avx512_shift_rmi_w<bits<8> opcw,
Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode> {
let Predicates = [HasBWI] in
defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
v32i16_info>, EVEX_V512;
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
v16i16x_info>, EVEX_V256;
defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
v8i16x_info>, EVEX_V128;
}
}
multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode> {
defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V;
defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V;
defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
// Use the 512-bit VPSRA/VPSRAI version to implement v2i64/v4i64 when VLX is not available (NoVLX).
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
VR128X:$src2)), sub_ymm)>;
def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
VR128X:$src2)), sub_xmm)>;
def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
imm:$src2)), sub_ymm)>;
def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
imm:$src2)), sub_xmm)>;
}
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
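// Variable shifts take a per-element shift count in the second vector
// operand.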
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(_.VT (bitconvert (_.LdFrag addr:$src2))))),
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
EVEX_CD8<_.EltSize, CD8VF>;
}
}
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))))),
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
}
}
multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
avx512vl_i32_info>;
defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
avx512vl_i64_info>, VEX_W;
}
// Use the 512-bit version to implement the 128/256-bit forms when VLX is not available (NoVLX).
multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
SDNode OpNode, list<Predicate> p> {
let Predicates = p in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
(_.info256.VT _.info256.RC:$src2))),
(EXTRACT_SUBREG
(!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
(_.info128.VT _.info128.RC:$src2))),
(EXTRACT_SUBREG
(!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
}
}
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
EVEX_V512, VEX_W;
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
EVEX_V256, VEX_W;
defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
EVEX_V128, VEX_W;
}
}
defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
avx512_var_shift_w<0x12, "vpsllvw", shl>;
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
avx512_var_shift_w<0x11, "vpsravw", sra>;
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
avx512_var_shift_w<0x10, "vpsrlvw", srl>;
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
// Special handling for VPSRAV intrinsics.
multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
list<Predicate> p> {
let Predicates = p in {
def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
_.RC:$src2)>;
def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rm)
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
_.RC:$src1, _.RC:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
_.RC:$src1, addr:$src2)>;
}
}
multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
list<Predicate> p> :
avx512_var_shift_int_lowering<InstrStr, _, p> {
let Predicates = p in {
def : Pat<(_.VT (X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
_.RC:$src1, addr:$src2)>;
}
}
defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
// Use the 512-bit VPROL/VPROLI version to implement v2i64/v4i64 and v4i32/v8i32 when VLX is not available (NoVLX).
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
sub_xmm)>;
def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
sub_ymm)>;
def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
sub_xmm)>;
def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
sub_ymm)>;
def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
imm:$src2)), sub_xmm)>;
def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
imm:$src2)), sub_ymm)>;
def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
imm:$src2)), sub_xmm)>;
def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
imm:$src2)), sub_ymm)>;
}
// Use the 512-bit VPROR/VPRORI version to implement v2i64/v4i64 and v4i32/v8i32 when VLX is not available (NoVLX).
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
sub_xmm)>;
def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
sub_ymm)>;
def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
sub_xmm)>;
def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
sub_ymm)>;
def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
imm:$src2)), sub_xmm)>;
def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
imm:$src2)), sub_ymm)>;
def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
imm:$src2)), sub_xmm)>;
def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
imm:$src2)), sub_ymm)>;
}
//===-------------------------------------------------------------------===//
// 1-src variable permutation VPERMB/W/D/Q and VPERMPS/PD
//===-------------------------------------------------------------------===//
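// The variable permutes reuse the avx512_var_shift multiclasses since the
// operand shapes match; only 512- and 256-bit forms are defined here for the
// D/Q element sizes.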
multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
}
multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info512>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
VTInfo.info256>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
VTInfo.info256>, EVEX_V256;
}
multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
Predicate prd, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [prd] in
defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512 ;
let Predicates = [HasVLX, prd] in {
defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
EVEX_V256 ;
defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
EVEX_V128 ;
}
}
defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
avx512vl_i16_info>, VEX_W;
defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
avx512vl_i8_info>;
defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
avx512vl_i32_info>;
defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
avx512vl_i64_info>, VEX_W;
defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
avx512vl_f32_info>;
defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
avx512vl_f64_info>, VEX_W;
defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
X86VPermi, avx512vl_i64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
X86VPermi, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPERMIL
//===----------------------------------------------------------------------===//
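// VPERMILPS/PD: the variable form takes an integer control vector (Ctrl);
// the immediate form reuses avx512_shift_rmi_sizes.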
multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(Ctrl.VT Ctrl.RC:$src2)))>,
T8PD, EVEX_4V;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (X86VBroadcast
(Ctrl.ScalarLdFrag addr:$src2)))))>,
T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
let Predicates = [HasAVX512] in {
defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
Ctrl.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
Ctrl.info128>, EVEX_V128;
defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
Ctrl.info256>, EVEX_V256;
}
}
multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
X86VPermilpi, _>,
EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
let ExeDomain = SSEPackedSingle in
defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
avx512vl_i32_info>;
let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
X86PShufd, avx512vl_i32_info>,
EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
X86PShufhw>, EVEX, AVX512XSIi8Base;
defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
X86PShuflw>, EVEX, AVX512XDIi8Base;
multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
let Predicates = [HasBWI] in
defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
let Predicates = [HasVLX, HasBWI] in {
defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256;
defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128;
}
}
defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
let Predicates = [HasAVX512] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
(VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
(VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
// MOVHLPS patterns
def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
(VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
}
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
// All patterns were taken from the SSE implementation.
//===----------------------------------------------------------------------===//
multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,
(OpNode _.RC:$src1,
(_.VT (bitconvert
(v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))],
IIC_SSE_MOV_LH>, EVEX_4V;
}
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
let Predicates = [HasAVX512] in {
// VMOVHPS patterns
def : Pat<(X86Movlhps VR128X:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128X:$src1,
(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
(VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPS patterns
def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
(VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
(VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPD patterns
def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1,
(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)),
(bc_v2f64 (v4f32 VR128X:$src))),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128X:$src, VR128X:$src)),
(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<32, CD8VT2>;
def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs),
(ins f64mem:$dst, VR128X:$src),
"vmovlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128X:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOV_LH>,
EVEX, EVEX_CD8<64, CD8VT1>, VEX_W;
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128X:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDZ128mr addr:$dst, VR128X:$src)>;
// VMOVLPS patterns
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
addr:$src1),
(VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
def : Pat<(store (v4i32 (X86Movlps
(bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
(VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
// VMOVLPD patterns
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
addr:$src1),
(VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
addr:$src1),
(VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
}
//===----------------------------------------------------------------------===//
// FMA - Fused Multiply Operations
//===----------------------------------------------------------------------===//
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
_.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
AVX512FMA3Base, EVEX_B;
}
// Additional pattern for folding broadcast nodes in other orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, _.RC:$src2,
(X86VBroadcast (_.ScalarLdFrag addr:$src3))),
_.RC:$src1)),
(!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
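// Roughly (illustrative): because the FMA multiplicands commute, a DAG such as
//   (vselect $mask, (OpNode $src1, $src2, (X86VBroadcast (load $src3))), $src1)
// can still be folded into the broadcast-memory form (the "mbk" instruction)
// even though the multiplicands appear in the opposite order from the pattern
// written inside the multiclass above; that is what the extra Pat provides.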
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
string Suff> {
let Predicates = [HasAVX512] in {
defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512,
Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
avx512vl_f64_info, "PD">, VEX_W;
}
defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
_.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
}
// Additional patterns for folding broadcast nodes in other orders.
def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1)),
(!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1,
_.RC:$src2, addr:$src3)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1),
_.RC:$src1)),
(!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1),
_.ImmAllZerosV)),
(!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
string Suff> {
let Predicates = [HasAVX512] in {
defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512,
Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
avx512vl_f64_info, "PD">, VEX_W;
}
defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
_.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B;
}
// Additional patterns for folding broadcast nodes in other orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2),
_.RC:$src1)),
(!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd, AVX512VLVectorVTInfo _,
string Suff> {
let Predicates = [HasAVX512] in {
defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512, Suff>,
avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512,
Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd > {
defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
avx512vl_f32_info, "PS">;
defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
avx512vl_f64_info, "PD">, VEX_W;
}
defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
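// Note (illustrative): the 132/213/231 suffixes name the operand order used by
// the multiply-add, with operand 1 doubling as the destination. As encoded in
// the patterns above, e.g. for packed single precision:
//   vfmadd132ps zmm1, zmm2, zmm3   ; zmm1 = zmm1 * zmm3 + zmm2
//   vfmadd213ps zmm1, zmm2, zmm3   ; zmm1 = zmm2 * zmm1 + zmm3
//   vfmadd231ps zmm1, zmm2, zmm3   ; zmm1 = zmm2 * zmm3 + zmm1
// In every form the memory/broadcast operand, when present, is operand 3.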
// Scalar FMA
let Constraints = "$src1 = $dst" in {
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
dag RHS_r, dag RHS_m > {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC;
let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[RHS_r]>;
def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[RHS_m]>;
}// isCodeGenOnly = 1
}
}// Constraints = "$src1 = $dst"
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
// Operands for intrinsic are in 123 order to preserve passthru
// semantics.
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
_.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
(_.ScalarLdFrag addr:$src3))))>;
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
(_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
_.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
_.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
}
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
X86FmaddRnds3>;
defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
X86FmsubRnds3>;
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
X86FnmaddRnds1, X86FnmaddRnds3>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
X86FnmsubRnds1, X86FnmsubRnds3>;
//===----------------------------------------------------------------------===//
// AVX-512 IFMA - Packed Multiply of Unsigned 52-bit Integers and Add the Low 52 Bits
//===----------------------------------------------------------------------===//
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
AVX512FMA3Base, EVEX_B;
}
}
} // Constraints = "$src1 = $dst"
multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [HasIFMA] in {
defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasIFMA] in {
defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
avx512vl_i64_info>, VEX_W;
defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
avx512vl_i64_info>, VEX_W;
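// Note (illustrative): vpmadd52luq multiplies the low 52 bits of each qword
// element of $src2 and $src3, producing a 104-bit product, and adds the low
// 52 bits of that product to the 64-bit accumulator element in $src1/$dst;
// vpmadd52huq adds the high 52 bits of the same product instead.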
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from sign integer to float/double
//===----------------------------------------------------------------------===//
multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
X86VectorVTInfo DstVT, X86MemOperand x86memop,
PatFrag ld_frag, string asm> {
let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
EVEX_4V;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
EVEX_4V;
} // hasSideEffects = 0
let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2),
!strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
(i32 FROUND_CURRENT)))]>, EVEX_4V;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
!strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2),
(i32 FROUND_CURRENT)))]>, EVEX_4V;
}//isCodeGenOnly = 1
}
multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
X86VectorVTInfo DstVT, string asm> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"),
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
(i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
}
multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
X86VectorVTInfo DstVT, X86MemOperand x86memop,
PatFrag ld_frag, string asm> {
defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>,
avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>,
VEX_LIG;
}
let Predicates = [HasAVX512] in {
defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
XD, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp GR32:$src)),
(VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f32 (sint_to_fp GR64:$src)),
(VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(f64 (sint_to_fp GR32:$src)),
(VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
v4f32x_info, i32mem, loadi32,
"cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTUSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0>;
def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
(VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
(VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (uint_to_fp GR32:$src)),
(VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f32 (uint_to_fp GR64:$src)),
(VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(f64 (uint_to_fp GR32:$src)),
(VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (uint_to_fp GR64:$src)),
(VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from float/double to integer
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
let Predicates = [HasAVX512] in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
EVEX, VEX_LIG;
def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC;
def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
(i32 FROUND_CURRENT)))]>,
EVEX, VEX_LIG;
} // Predicates = [HasAVX512]
}
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
X86cvts2si, "cvtss2si">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
X86cvts2si, "cvtss2si">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
X86cvts2usi, "cvtss2usi">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
X86cvts2usi, "cvtss2usi">, XS, VEX_W,
EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
X86cvts2si, "cvtsd2si">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
X86cvts2si, "cvtsd2si">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
X86cvts2usi, "cvtsd2usi">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
EVEX_CD8<64, CD8VT1>;
// The SSE versions of these instructions are disabled for AVX512.
// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
(VCVTSS2SIZrr VR128X:$src)>;
def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)),
(VCVTSS2SIZrm sse_load_f32:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
(VCVTSS2SI64Zrr VR128X:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)),
(VCVTSS2SI64Zrm sse_load_f32:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
(VCVTSD2SIZrr VR128X:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)),
(VCVTSD2SIZrm sse_load_f64:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
(VCVTSD2SI64Zrr VR128X:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)),
(VCVTSD2SI64Zrm sse_load_f64:$src)>;
} // HasAVX512
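// For example (illustrative), the C intrinsic _mm_cvtss_si32 lowers to
// int_x86_sse_cvtss2si, which the patterns above select to the EVEX-encoded
// VCVTSS2SIZrr rather than to the disabled VEX/SSE encoding.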
let Predicates = [HasAVX512] in {
def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, GR32:$src2),
(VCVTSI2SSZrr_Int VR128X:$src1, GR32:$src2)>;
def : Pat<(int_x86_sse_cvtsi2ss VR128X:$src1, (loadi32 addr:$src2)),
(VCVTSI2SSZrm_Int VR128X:$src1, addr:$src2)>;
def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, GR64:$src2),
(VCVTSI642SSZrr_Int VR128X:$src1, GR64:$src2)>;
def : Pat<(int_x86_sse_cvtsi642ss VR128X:$src1, (loadi64 addr:$src2)),
(VCVTSI642SSZrm_Int VR128X:$src1, addr:$src2)>;
def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, GR32:$src2),
(VCVTSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
def : Pat<(int_x86_sse2_cvtsi2sd VR128X:$src1, (loadi32 addr:$src2)),
(VCVTSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, GR64:$src2),
(VCVTSI642SDZrr_Int VR128X:$src1, GR64:$src2)>;
def : Pat<(int_x86_sse2_cvtsi642sd VR128X:$src1, (loadi64 addr:$src2)),
(VCVTSI642SDZrm_Int VR128X:$src1, addr:$src2)>;
def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, GR32:$src2),
(VCVTUSI2SDZrr_Int VR128X:$src1, GR32:$src2)>;
def : Pat<(int_x86_avx512_cvtusi2sd VR128X:$src1, (loadi32 addr:$src2)),
(VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>;
} // Predicates = [HasAVX512]
// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang,
// which would otherwise produce unnecessary vmovs{s,d} instructions.
let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
(VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
(VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
(VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
(VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
} // Predicates = [HasAVX512]
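// Roughly (illustrative), a DAG of the shape
//   (X86Movss $dst, (scalar_to_vector (sint_to_fp GR32:$src)))
// is matched directly to VCVTSI2SSZrr_Int, so no separate vmovss is emitted to
// merge the converted scalar back into the destination vector.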
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
SDNode OpNodeRnd, string aliasStr>{
let Predicates = [HasAVX512] in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
let hasSideEffects = 0 in
def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[]>, EVEX, EVEX_B;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
EVEX;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
(!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
_SrcRC.ScalarMemOp:$src), 0>;
let isCodeGenOnly = 1 in {
def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
(i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
(i32 FROUND_NO_EXC)))]>,
EVEX,VEX_LIG , EVEX_B;
let mayLoad = 1, hasSideEffects = 0 in
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[]>, EVEX, VEX_LIG;
} // isCodeGenOnly = 1
} //HasAVX512
}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
fp_to_sint, X86cvtts2IntRnd, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
fp_to_sint, X86cvtts2IntRnd, "{q}">,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
fp_to_sint, X86cvtts2IntRnd, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
fp_to_sint, X86cvtts2IntRnd, "{q}">,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
fp_to_uint, X86cvtts2UIntRnd, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
fp_to_uint, X86cvtts2UIntRnd, "{q}">,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
fp_to_uint, X86cvtts2UIntRnd, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
fp_to_uint, X86cvtts2UIntRnd, "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
(VCVTTSS2SIZrr_Int VR128X:$src)>;
def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)),
(VCVTTSS2SIZrm_Int ssmem:$src)>;
def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
(VCVTTSS2SI64Zrr_Int VR128X:$src)>;
def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)),
(VCVTTSS2SI64Zrm_Int ssmem:$src)>;
def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
(VCVTTSD2SIZrr_Int VR128X:$src)>;
def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)),
(VCVTTSD2SIZrm_Int sdmem:$src)>;
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
(VCVTTSD2SI64Zrr_Int VR128X:$src)>;
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
(VCVTTSD2SI64Zrm_Int sdmem:$src)>;
} // HasAVX512
//===----------------------------------------------------------------------===//
// AVX-512 Convert from float to double and back
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode> {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.ScalarIntMemCPat:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
(i32 FROUND_NO_EXC)))>,
EVEX_4V, VEX_LIG, EVEX_B;
}
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _src,
X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
X86froundRnd, f64x_info, f32x_info>;
defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
X86fpextRnd,f32x_info, f64x_info >;
def : Pat<(f64 (fpextend FR32X:$src)),
(VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (extloadf32 addr:$src)),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
(VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
(VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector
(f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
(VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
//===----------------------------------------------------------------------===//
// AVX-512 Vector convert from signed/unsigned integer to float/double
// and from float/double to signed/unsigned integer
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
(_.VT (OpNode (_Src.VT
(bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src), OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
(X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
))>, EVEX, EVEX_B;
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
(_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
(i32 FROUND_NO_EXC)))>,
EVEX, EVEX_B;
}
// Conversion with rounding control (RC)
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
(_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
EVEX, EVEX_B, EVEX_RC;
}
// Extend Float to Double
multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
X86vfpextRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
X86vfpext, "{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
EVEX_V256;
}
}
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
X86vfpround, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
"{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
}
}
defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
VEX_W, PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
PS, EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
(VCVTPD2PSZ128rr VR128X:$src)>;
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
(VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
(VCVTPS2PDZ256rm addr:$src)>;
}
// Convert Signed/Unsigned Doubleword to Double
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128> {
// No rounding in this op
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
OpNode128, "{1to2}", "", i64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
OpNodeRnd>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// We need the "x"/"y" suffixes in order to distinguish between the 128- and
// 256-bit memory forms of these instructions in the asm parser, since they
// have the same dest type - 'v4i32x_info'. We also specify the broadcast
// string explicitly for the same reason. (An illustrative example follows
// this multiclass.)
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
OpNode128, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
}
}
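// For example (illustrative), with an AT&T-style memory operand the suffix
// tells the assembler which source width to encode:
//   vcvttpd2dqx (%rax), %xmm0    # 128-bit (v2f64) memory source
//   vcvttpd2dqy (%rax), %xmm0    # 256-bit (v4f64) memory source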
// Convert Double to Signed/Unsigned Doubleword
multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// We need the "x"/"y" suffixes in order to distinguish between the 128- and
// 256-bit memory forms of these instructions in the asm parser, since they
// have the same dest type - 'v4i32x_info'. We also specify the broadcast
// string explicitly for the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
"{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0>;
}
}
// Convert Double to Signed/Unsigned Quadword
multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword
multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specify the broadcast string, since we take only 2 elements
// from the v4f32x_info source.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
"{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specify the broadcast string, since we take only 2 elements
// from the v4f32x_info source.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
"{1to2}", "", f64mem>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
OpNodeRnd>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// We need the "x"/"y" suffixes in order to distinguish between the 128- and
// 256-bit memory forms of these instructions in the asm parser, since they
// have the same dest type - 'v4f32x_info'. We also specify the broadcast
// string explicitly for the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
"{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0>;
}
}
defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>,
XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
X86VSintToFpRnd>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
X86cvttp2siRnd>,
XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
X86cvttp2siRnd>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
X86cvttp2uiRnd>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W,
EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>,
XS, EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
X86VUintToFpRnd>, XD,
EVEX_CD8<32, CD8VF>;
defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
X86cvtp2IntRnd>, XD, VEX_W,
EVEX_CD8<64, CD8VF>;
defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
X86cvtp2UIntRnd>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
X86cvtp2UIntRnd>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
X86cvtp2IntRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
X86cvtp2UIntRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
X86cvttp2siRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
X86cvttp2uiRnd>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
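// When AVX512VL is not available, 128/256-bit conversions are lowered by
// widening the operand into a 512-bit register (INSERT_SUBREG into an
// IMPLICIT_DEF), executing the 512-bit instruction, and extracting the low
// subregister of the result.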
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
(v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
(EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src, sub_xmm)))), sub_xmm)>;
def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
(v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
(v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
}
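// The 128-bit forms of these conversions write only the low 64 bits of the
// destination and zero the remaining bits, so an explicit zero-extending move
// (X86vzmovl) of the result is redundant and can be folded away.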
let Predicates = [HasAVX512, HasVLX] in {
let AddedComplexity = 15 in {
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
(VCVTPD2DQZ128rr VR128X:$src)>;
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))),
(VCVTPD2UDQZ128rr VR128X:$src)>;
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
(VCVTTPD2DQZ128rr VR128X:$src)>;
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))),
(VCVTTPD2UDQZ128rr VR128X:$src)>;
}
}
let Predicates = [HasAVX512] in {
def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
(VCVTPD2PSZrm addr:$src)>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
}
let Predicates = [HasDQI, HasVLX] in {
let AddedComplexity = 15 in {
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))),
(VCVTQQ2PSZ128rr VR128X:$src)>;
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))),
(VCVTUQQ2PSZ128rr VR128X:$src)>;
}
}
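// Likewise, with AVX512DQ but without AVX512VL, widen the 128/256-bit
// quadword <-> floating-point conversions to their 512-bit forms.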
let Predicates = [HasDQI, NoVLX] in {
def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
(v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_ymm)>;
def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))),
(EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))),
(EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))),
(EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR128X:$src1, sub_xmm)))), sub_xmm)>;
def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
(EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_ymm)>;
}
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag> {
defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
"vcvtph2ps", "$src", "$src",
(X86cvtph2ps (_src.VT _src.RC:$src),
(i32 FROUND_CURRENT))>, T8PD;
defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
"vcvtph2ps", "$src", "$src",
(X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
(i32 FROUND_CURRENT))>, T8PD;
}
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
"vcvtph2ps", "{sae}, $src", "$src, {sae}",
(X86cvtph2ps (_src.VT _src.RC:$src),
(i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
}
let Predicates = [HasAVX512] in {
defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop> {
defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
(i32 imm:$src2)),
NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
(i32 imm:$src2))),
addr:$dst)]>;
let hasSideEffects = 0, mayStore = 1 in
def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
(ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[]>, EVEX_K;
}
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
let hasSideEffects = 0 in
defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
[]>, EVEX_B, AVX512AIi8Base;
}
let Predicates = [HasAVX512] in {
defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
}
// Patterns for matching conversions from float to half-float and vice versa.
let Predicates = [HasVLX] in {
// Use MXCSR.RC for rounding instead of explicitly specifying the default
// rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
// configurations we support (the default). However, falling back to MXCSR is
// more consistent with other instructions, which are always controlled by it.
// The MXCSR fallback is encoded as immediate 0b100 (the '4' passed below).
def : Pat<(fp_to_f16 FR32X:$src),
(i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (VCVTPS2PHZ128rr
(COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
(f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
(COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), FR32X)) >;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
(f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
(VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
}
// Patterns for matching float to half-float conversion when AVX512 is supported
// but F16C and AVX512VL aren't. In that case we have to use 512-bit vectors.
let Predicates = [HasAVX512, NoVLX, NoF16C] in {
def : Pat<(fp_to_f16 FR32X:$src),
(i16 (EXTRACT_SUBREG
(VMOVPDI2DIZrr
(v8i16 (EXTRACT_SUBREG
(VCVTPS2PHZrr
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
(v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
sub_xmm), 4), sub_xmm))), sub_16bit))>;
def : Pat<(f16_to_fp GR16:$src),
(f32 (COPY_TO_REGCLASS
(v4f32 (EXTRACT_SUBREG
(VCVTPH2PSZrr
(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
(v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
sub_xmm)), sub_xmm)), FR32X))>;
def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
(f32 (COPY_TO_REGCLASS
(v4f32 (EXTRACT_SUBREG
(VCVTPH2PSZrr
(VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
(v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
sub_xmm), 4)), sub_xmm)), FR32X))>;
}
// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
[], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
Sched<[WriteFAdd]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
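// Non-SAE forms of COMIS/UCOMIS, including the memory variants and the
// intrinsic (vector-operand) versions below.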
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
"ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
"comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
"comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
sse_load_f64, "ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
sse_load_f64, "comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
}
}
defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD;
defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
EVEX_CD8<32, CD8VT1>, T8PD;
defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT
(bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, T8PD, EVEX_B;
}
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
OpNode, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
OpNode, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
OpNode, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
OpNode, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_CURRENT))>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_NO_EXC))>, EVEX_B;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
}
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
EVEX_CD8<32, CD8VT1>;
defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
EVEX_CD8<64, CD8VT1>, VEX_W;
}
let Predicates = [HasERI] in {
defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
}
defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT
(bitconvert (_.LdFrag addr:$src))),
(i32 FROUND_CURRENT))>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
(i32 FROUND_CURRENT))>, EVEX_B;
}
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
(OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
let Predicates = [HasERI] in {
defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX;
defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX;
}
defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
(_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
EVEX, EVEX_B, EVEX_RC;
}
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT
(bitconvert (_.LdFrag addr:$src))))>, EVEX;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, EVEX_B;
}
}
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
v8f64_info>,
EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, v4f32x_info>,
EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, v8f32x_info>,
EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, v2f64x_info>,
EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, v4f64x_info>,
EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
}
multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd> {
defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 FROUND_CURRENT))>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$rc))>,
EVEX_B, EVEX_RC;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
}
}
def : Pat<(_.EltVT (OpNode _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
def : Pat<(_.EltVT (OpNode (load addr:$src))),
(!cast<Instruction>(NAME#SUFF#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
}
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
}
defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
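// Select scalar reciprocal and reciprocal-square-root estimates to the
// RCP14/RSQRT14 forms (relative error < 2^-14); the pass-through operand is
// left undefined.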
let Predicates = [HasAVX512] in {
def : Pat<(f32 (X86frsqrt FR32X:$src)),
(COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
def : Pat<(f32 (X86frsqrt (load addr:$src))),
(COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[OptForSize]>;
def : Pat<(f32 (X86frcp FR32X:$src)),
(COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
def : Pat<(f32 (X86frcp (load addr:$src))),
(COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
Requires<[OptForSize]>;
}
multiclass
avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$src3), (i32 FROUND_CURRENT)))>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3), (i32 FROUND_CURRENT)))>;
}
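// Map libm-style rounding nodes onto RNDSCALE immediates: 0x9 = floor,
// 0xA = ceil, 0xB = trunc, 0x4 = rint (MXCSR rounding), 0xC = nearbyint
// (MXCSR rounding, precision exceptions suppressed).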
let Predicates = [HasAVX512] in {
def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>;
def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>;
def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0x9))), _.FRC)>;
def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0xb))), _.FRC)>;
def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0x4))), _.FRC)>;
def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0xc))), _.FRC)>;
}
}
defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
X86MemOperand x86memop> {
let ExeDomain = DestInfo.ExeDomain in
defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
EVEX, T8XS;
// For intrinsic pattern match
def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
undef)),
(!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
SrcInfo.RC:$src1)>;
def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
DestInfo.ImmAllZerosV)),
(!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
SrcInfo.RC:$src1)>;
def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
DestInfo.RC:$src0)),
(!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
DestInfo.KRCWM:$mask ,
SrcInfo.RC:$src1)>;
let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
ExeDomain = DestInfo.ExeDomain in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
[]>, EVEX;
def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>, EVEX, EVEX_K;
}//mayStore = 1, mayLoad = 1, hasSideEffects = 0
}
multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
X86VectorVTInfo DestInfo,
PatFrag truncFrag, PatFrag mtruncFrag > {
def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
(!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
addr:$dst, SrcInfo.RC:$src)>;
def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
(SrcInfo.VT SrcInfo.RC:$src)),
(!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
Predicate prd = HasAVX512>{
let Predicates = [HasVLX, prd] in {
defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
DestInfoZ128, x86memopZ128>,
avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
truncFrag, mtruncFrag>, EVEX_V128;
defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
DestInfoZ256, x86memopZ256>,
avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
truncFrag, mtruncFrag>, EVEX_V256;
}
let Predicates = [prd] in
defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
DestInfoZ, x86memopZ>,
avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
truncFrag, mtruncFrag>, EVEX_V512;
}
multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
}
multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
}
multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
}
multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
}
multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
}
multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
PatFrag StoreNode, PatFrag MaskedStoreNode> {
defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
}
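// VPMOV*   - truncate with wraparound
// VPMOVS*  - truncate with signed saturation
// VPMOVUS* - truncate with unsigned saturation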
defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc,
truncstorevi8, masked_truncstorevi8>;
defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs,
truncstore_s_vi8, masked_truncstore_s_vi8>;
defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8>;
defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc,
truncstorevi16, masked_truncstorevi16>;
defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs,
truncstore_s_vi16, masked_truncstore_s_vi16>;
defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
truncstore_us_vi16, masked_truncstore_us_vi16>;
defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc,
truncstorevi32, masked_truncstorevi32>;
defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs,
truncstore_s_vi32, masked_truncstore_s_vi32>;
defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
truncstore_us_vi32, masked_truncstore_us_vi32>;
defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
truncstorevi8, masked_truncstorevi8>;
defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs,
truncstore_s_vi8, masked_truncstore_s_vi8>;
defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8>;
defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
truncstorevi16, masked_truncstorevi16>;
defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs,
truncstore_s_vi16, masked_truncstore_s_vi16>;
defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
truncstore_us_vi16, masked_truncstore_us_vi16>;
defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
truncstorevi8, masked_truncstorevi8>;
defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs,
truncstore_s_vi8, masked_truncstore_s_vi8>;
defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8>;
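// Without AVX512VL, lower the 256-bit truncations by widening the source to
// 512 bits and extracting the low 128 bits of the result.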
let Predicates = [HasAVX512, NoVLX] in {
def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))),
(v4i32 (EXTRACT_SUBREG
(v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
let Predicates = [HasBWI, NoVLX] in {
def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
}
multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
let ExeDomain = DestInfo.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
EVEX;
defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins x86memop:$src), OpcodeStr ,"$src", "$src",
(DestInfo.VT (LdFrag addr:$src))>,
EVEX;
}
}
multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasBWI] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info,
v32i8x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512;
}
}
multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512;
}
}
multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
v16i8x_info, i32mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512;
}
}
multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
v16i16x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512;
}
}
multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
v8i16x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
v8i16x_info, i64mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512;
}
}
multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
v4i32x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
v8i32x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
}
}
defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">;
defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">;
defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">;
defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">;
defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">;
defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">;
defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">;
defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">;
defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">;
defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">;
defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
// EXTLOAD patterns, implemented using the vpmovzx instructions above
multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
X86VectorVTInfo From, PatFrag LdFrag> {
def : Pat<(To.VT (LdFrag addr:$src)),
(!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
(!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
To.KRC:$mask, addr:$src)>;
def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
To.ImmAllZerosV)),
(!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
}
let Predicates = [HasBWI] in {
defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
}
let Predicates = [HasVLX, HasAVX512] in {
defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
}
let Predicates = [HasAVX512] in {
defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
}
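// Fold the various load forms (scalar_to_vector of a scalar load,
// zero-extending vector loads and full vector loads) into the memory forms of
// the sign/zero extend instructions defined above.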
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
SDNode InVecOp, PatFrag ExtLoad16> {
// 128-bit patterns
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
// 256-bit patterns
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
}
// 512-bit patterns
let Predicates = [HasBWI] in {
def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
}
}
defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86MemOperand memop, PatFrag GatherNode> {
let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
ExeDomain = _.ExeDomain in
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
(ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
[(set _.RC:$dst, _.KRCWM:$mask_wb,
(GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
vectoraddr:$src2))]>, EVEX, EVEX_K,
EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
vy512mem, mgatherv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
}
}
multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
vy256xmem, mgatherv8i32>, EVEX_V256;
defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
vy128xmem, mgatherv4i64>, EVEX_V256;
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
vx64xmem, X86mgatherv2i64>, EVEX_V128;
}
}
defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">,
avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">;
defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">,
avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86MemOperand memop, PatFrag ScatterNode> {
let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb),
(ins memop:$dst, _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
[(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
_.KRCWM:$mask, vectoraddr:$dst))]>,
EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
vy512mem, mscatterv8i32>, EVEX_V512, VEX_W;
defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
}
}
multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
vy256xmem, mscatterv8i32>, EVEX_V256;
defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
vy128xmem, mscatterv4i64>, EVEX_V256;
defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mscatterv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
vx64xmem, mscatterv2i64>, EVEX_V128;
}
}
defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">,
avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">;
defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">,
avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">;
// Gather/scatter prefetch instructions
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
let Predicates = [HasPFI], hasSideEffects = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
!strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
[]>, EVEX, EVEX_K;
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
// Helper fragments to match sext vXi1 to vXiY.
def v64i1sextv64i8 : PatLeaf<(v64i8
(X86vsext
(v64i1 (X86pcmpgtm
(bc_v64i8 (v16i32 immAllZerosV)),
VR512:$src))))>;
def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>;
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
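// vpmovm2* broadcast each mask bit into a full vector element of all ones or
// all zeros, i.e. a sign extension of the mask (X86vsext).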
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
!strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
[(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
}
// Use the 512-bit version to implement 128/256-bit in the NoVLX case.
multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
X86VectorVTInfo _> {
def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
(X86Info.VT (EXTRACT_SUBREG
(_.VT (!cast<Instruction>(NAME#"Zrr")
(_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
X86Info.SubRegIdx))>;
}
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
}
let Predicates = [prd, NoVLX] in {
defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>;
defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>;
}
}
defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
}
// Use the 512-bit version to implement 128/256-bit in the NoVLX case.
multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo,
X86VectorVTInfo _> {
def : Pat<(_.KVT (X86cvt2mask (_.VT _.RC:$src))),
(_.KVT (COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"Zrr")
(INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
_.RC:$src, _.SubRegIdx)),
_.KRC))>;
}
multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>,
EVEX_V256;
defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
EVEX_V128;
}
let Predicates = [prd, NoVLX] in {
defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256>;
defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128>;
}
}
defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m",
avx512vl_i8_info, HasBWI>;
defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m",
avx512vl_i16_info, HasBWI>, VEX_W;
defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m",
avx512vl_i32_info, HasDQI>;
defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
avx512vl_i64_info, HasDQI>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - COMPRESS and EXPAND
//
multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
let mayStore = 1, hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
[]>, EVEX_CD8<_.EltSize, CD8VT1>;
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>,
EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
(_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
EVEX;
defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
EVEX, VEX_W;
defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
EVEX;
defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
EVEX, VEX_W;
// expand
multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand (_.VT (bitconvert
(_.LdFrag addr:$src1)))))>,
AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
(!cast<Instruction>(NAME#_.ZSuffix##rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
(_.VT _.RC:$src0))),
(!cast<Instruction>(NAME#_.ZSuffix##rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
}
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo VTInfo> {
defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>,
expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>,
expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>,
expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
EVEX;
defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
EVEX, VEX_W;
defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
EVEX;
defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
EVEX, VEX_W;
// Handle instruction reg_vec1 = op(reg_vec, imm)
//                               op(mem_vec, imm)
//                               op(broadcast(eltVt), imm)
// All instructions are created with FROUND_CURRENT.
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
(i32 FROUND_CURRENT))>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i32 imm:$src2),
(i32 FROUND_CURRENT))>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
(OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
(i32 imm:$src2),
(i32 FROUND_CURRENT))>, EVEX_B;
}
}
// Handle instruction reg_vec1 = op(reg_vec2, imm), {sae}
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
EVEX_V128;
defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
EVEX_V256;
}
}
// Handle instruction reg_vec1 = op(reg_vec2, reg_vec3, imm)
//                               op(reg_vec2, mem_vec, imm)
//                               op(reg_vec2, broadcast(eltVt), imm)
// All instructions are created with FROUND_CURRENT.
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>, EVEX_B;
}
}
// Handle instruction reg_vec1 = op(reg_vec2, reg_vec3, imm)
//                               op(reg_vec2, mem_vec, imm)
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
let ExeDomain = DestInfo.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
(i8 imm:$src3)))>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
(i8 imm:$src3)))>;
}
}
// Handle instruction reg_vec1 = op(reg_vec2, reg_vec3, imm)
//                               op(reg_vec2, mem_vec, imm)
//                               op(reg_vec2, broadcast(eltVt), imm)
multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>:
avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
(i8 imm:$src3))>, EVEX_B;
}
// Handle scalar instruction reg_vec1 = op(reg_vec2, reg_vec3, imm)
//                                      op(reg_vec2, mem_scalar, imm)
// All instructions are created with FROUND_CURRENT.
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
(i32 imm:$src3),
(i32 FROUND_CURRENT))>;
}
}
// Handle instruction reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
// Handle scalar instruction reg_vec1 = op(reg_vec2, reg_vec3, imm), {sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
EVEX_V128;
defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
EVEX_V256;
}
}
multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
let Predicates = [HasBWI] in {
defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
}
let Predicates = [HasBWI, HasVLX] in {
defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256,
SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
}
}
multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode>{
let Predicates = [HasAVX512] in {
defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
}
}
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
0x50, X86VRange, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info,
0x51, X86VRange, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
0x51, X86VRange, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
0x57, X86Reduces, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
0x57, X86Reduces, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
0x27, X86GetMants, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
0x27, X86GetMants, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
bits<8> opc, SDNode OpNode = X86Shuf128>{
let Predicates = [HasAVX512] in {
defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
}
}
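// Note on the VRNDSCALE immediates used in the patterns below (informal
// summary of the encoding, not taken from this patch): bits [7:4] select how
// many fraction bits to keep (0 here, i.e. round to integer), bit 3
// suppresses the precision exception, bit 2 selects the MXCSR rounding mode
// when set, and bits [1:0] give the embedded rounding mode (00 nearest,
// 01 down, 10 up, 11 truncate).  Hence 0x9 = floor, 0xA = ceil, 0xB = trunc,
// 0xC = nearbyint (MXCSR mode, exceptions suppressed) and 0x4 = rint
// (MXCSR mode, exceptions reported).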
let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (ffloor VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
def : Pat<(v16f32 (fnearbyint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
def : Pat<(v16f32 (fceil VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
def : Pat<(v16f32 (frint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
def : Pat<(v16f32 (ftrunc VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
def : Pat<(v8f64 (ffloor VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
def : Pat<(v8f64 (fnearbyint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
def : Pat<(v8f64 (fceil VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
def : Pat<(v8f64 (frint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
def : Pat<(v8f64 (ftrunc VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasAVX512] in {
// Provide a fallback in case the load node used in the broadcast patterns
// above has additional users, which prevents the patterns from being
// selected.
def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
(VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
(VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
(VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
(VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
(VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
(VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
0)>;
}
multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
AVX512AIi8Base, EVEX_4V;
}
defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
EVEX_CD8<32, CD8VF>;
defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{
let Predicates = p in
def NAME#_.VTName#rri:
Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
(!cast<Instruction>(NAME#_.ZSuffix#rri)
_.RC:$src1, _.RC:$src2, imm:$imm)>;
}
multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>:
avx512_vpalignr_lowering<_.info512, [HasBWI]>,
avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>,
avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>;
defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
avx512vl_i8_info, avx512vl_i8_info>,
avx512_vpalignr_lowering_common<avx512vl_i16_info>,
avx512_vpalignr_lowering_common<avx512vl_i32_info>,
avx512_vpalignr_lowering_common<avx512vl_f32_info>,
avx512_vpalignr_lowering_common<avx512vl_i64_info>,
avx512_vpalignr_lowering_common<avx512vl_f64_info>,
EVEX_CD8<8, CD8VF>;
defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
(_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
(_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
}
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> :
avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
(_.VT (OpNode (X86VBroadcast
(_.ScalarLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
SDNode OpNode, Predicate prd> {
defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
prd>, VEX_W;
defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
prd>;
}
multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
SDNode OpNode, Predicate prd> {
defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
}
multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
string OpcodeStr, SDNode OpNode> {
defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
HasAVX512>,
avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
HasBWI>;
}
defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
// VPABS: Use the 512-bit version to implement 128/256-bit in the NoVLX case.
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (abs VR256X:$src)),
(EXTRACT_SUBREG
(VPABSQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
sub_ymm)>;
def : Pat<(v2i64 (abs VR128X:$src)),
(EXTRACT_SUBREG
(VPABSQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
sub_xmm)>;
}
multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
}
defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
// VPLZCNT: Use the 512-bit version to implement 128/256-bit in the NoVLX case.
let Predicates = [HasCDI, NoVLX] in {
def : Pat<(v4i64 (ctlz VR256X:$src)),
(EXTRACT_SUBREG
(VPLZCNTQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
sub_ymm)>;
def : Pat<(v2i64 (ctlz VR128X:$src)),
(EXTRACT_SUBREG
(VPLZCNTQZrr
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
sub_xmm)>;
def : Pat<(v8i32 (ctlz VR256X:$src)),
(EXTRACT_SUBREG
(VPLZCNTDZrr
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
sub_ymm)>;
def : Pat<(v4i32 (ctlz VR128X:$src)),
(EXTRACT_SUBREG
(VPLZCNTDZrr
(INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
sub_xmm)>;
}
//===---------------------------------------------------------------------===//
// Counts the number of one bits - VPOPCNTD and VPOPCNTQ
//===---------------------------------------------------------------------===//
multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> {
let Predicates = [HasVPOPCNTDQ] in
defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512;
}
// Use the 512-bit version to implement 128/256-bit.
multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
(EXTRACT_SUBREG
(!cast<Instruction>(NAME # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
_.info256.RC:$src1,
_.info256.SubRegIdx)),
_.info256.SubRegIdx)>;
def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
(EXTRACT_SUBREG
(!cast<Instruction>(NAME # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
_.info128.RC:$src1,
_.info128.SubRegIdx)),
_.info128.SubRegIdx)>;
}
}
defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>,
avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>,
avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W;
//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
HasAVX512>, XS;
}
defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
//===----------------------------------------------------------------------===//
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src)))))>,
EVEX, EVEX_CD8<_.EltSize, CD8VH>;
}
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo VTInfo> {
defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
EVEX_V256;
defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode,
avx512vl_f64_info>, XD, VEX_W;
}
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
let Predicates = [HasVLX] in {
def : Pat<(X86Movddup (loadv2f64 addr:$src)),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
(bitconvert (v4i32 immAllZerosV))),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
(COPY_TO_REGCLASS FR64X:$src, VR128X))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(bitconvert (v4i32 immAllZerosV))),
(VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(bitconvert (v4i32 immAllZerosV))),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
SSE_ALU_ITINS_S>;
defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
SSE_ALU_ITINS_S>;
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
SSE_INTALU_ITINS_P, HasBWI>;
defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh,
SSE_INTALU_ITINS_P, HasBWI>;
defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl,
SSE_INTALU_ITINS_P, HasBWI>;
defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh,
SSE_INTALU_ITINS_P, HasBWI>;
defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl,
SSE_INTALU_ITINS_P, HasAVX512>;
defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh,
SSE_INTALU_ITINS_P, HasAVX512>;
defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl,
SSE_INTALU_ITINS_P, HasAVX512>;
defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh,
SSE_INTALU_ITINS_P, HasAVX512>;
//===----------------------------------------------------------------------===//
// AVX-512 - Extract & Insert Integer Instructions
//===----------------------------------------------------------------------===//
multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
def mr : AVX512Ii8<opc, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
imm:$src2)))),
addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
let Predicates = [HasBWI] in {
def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
EVEX, TAPD;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
}
}
multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
let Predicates = [HasBWI] in {
def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
EVEX, PD;
let hasSideEffects = 0 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
EVEX, TAPD, FoldGenData<NAME#rr>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
}
multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
RegisterClass GRC> {
let Predicates = [HasDQI] in {
def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GRC:$dst,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
EVEX, TAPD;
def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (_.VT _.RC:$src1),
imm:$src2),addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
}
}
defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, PatFrag LdFrag> {
def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, PatFrag LdFrag> {
let Predicates = [HasBWI] in {
def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
}
}
multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, RegisterClass GRC> {
let Predicates = [HasDQI] in {
def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, GRC:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
EVEX_4V, TAPD;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
_.ScalarLdFrag>, TAPD;
}
}
defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
extloadi8>, TAPD;
defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
extloadi16>, PD;
defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
//===----------------------------------------------------------------------===//
multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
AVX512AIi8Base, EVEX_4V;
}
defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))))]>;
}
multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr, Predicate prd>{
let Predicates = [prd] in
defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
OpcodeStr, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
OpcodeStr, v32i8x_info>, EVEX_V256;
defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
OpcodeStr, v16i8x_info>, EVEX_V128;
}
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
HasBWI>, AVX512PDIi8Base, EVEX_4V;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
HasBWI>, AVX512PDIi8Base, EVEX_4V;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86VectorVTInfo _dst,
X86VectorVTInfo _src>{
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT _src.RC:$src2))))]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT (bitconvert
(_src.LdFrag addr:$src2))))))]>;
}
multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
v32i8x_info>, EVEX_V256;
defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
v16i8x_info>, EVEX_V128;
}
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
HasBWI>, EVEX_4V;
// Transforms to swizzle an immediate to enable better matching when the
// memory operand isn't in the right place.
def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
// Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/4 and 3/6.
uint8_t NewImm = Imm & 0xa5;
if (Imm & 0x02) NewImm |= 0x10;
if (Imm & 0x10) NewImm |= 0x02;
if (Imm & 0x08) NewImm |= 0x40;
if (Imm & 0x40) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
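// Worked example (illustrative value, not taken from this patch): 0xCA
// encodes A ? B : C.  0xCA & 0xa5 keeps only bit 7, and applying the 1/4 and
// 3/6 swaps then gives 0xD8, i.e. the same select expressed with operands 0
// and 2 exchanged.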
def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
// Convert a VPTERNLOG immediate by swapping operand 0 and operand 1.
uint8_t Imm = N->getZExtValue();
// Swap bits 2/4 and 3/5.
uint8_t NewImm = Imm & 0xc3;
if (Imm & 0x04) NewImm |= 0x10;
if (Imm & 0x10) NewImm |= 0x04;
if (Imm & 0x08) NewImm |= 0x20;
if (Imm & 0x20) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
// Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/2 and 5/6.
uint8_t NewImm = Imm & 0x99;
if (Imm & 0x02) NewImm |= 0x04;
if (Imm & 0x04) NewImm |= 0x02;
if (Imm & 0x20) NewImm |= 0x40;
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
// Convert a VPTERNLOG immediate by moving operand 1 to the end.
uint8_t Imm = N->getZExtValue();
// Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
uint8_t NewImm = Imm & 0x81;
if (Imm & 0x02) NewImm |= 0x04;
if (Imm & 0x04) NewImm |= 0x10;
if (Imm & 0x08) NewImm |= 0x40;
if (Imm & 0x10) NewImm |= 0x02;
if (Imm & 0x20) NewImm |= 0x08;
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
// Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
uint8_t Imm = N->getZExtValue();
// Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
uint8_t NewImm = Imm & 0x81;
if (Imm & 0x02) NewImm |= 0x10;
if (Imm & 0x04) NewImm |= 0x02;
if (Imm & 0x08) NewImm |= 0x20;
if (Imm & 0x10) NewImm |= 0x04;
if (Imm & 0x20) NewImm |= 0x40;
if (Imm & 0x40) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
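// The hand-unrolled transforms above are equivalent to permuting the 8-entry
// truth table of the immediate.  A minimal illustrative helper (not part of
// this patch) for the operand 0 <-> operand 2 swap, assuming bit index
// A*4 + B*2 + C with A = operand 0:
//
//   static uint8_t swizzleTernlog321(uint8_t Imm) {
//     uint8_t New = 0;
//     for (unsigned Idx = 0; Idx < 8; ++Idx) {
//       unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
//       if (Imm & (1 << (C * 4 + B * 2 + A))) // read with A and C swapped
//         New |= 1 << Idx;
//     }
//     return New; // e.g. swizzleTernlog321(0xCA) == 0xD8
//   }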
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
(i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
(i8 imm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
(i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
// Additional patterns for matching loads in other positions.
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
(!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
(!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with loads in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching masked loads with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
// Additional patterns for matching broadcasts in other positions.
def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
(!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4))),
(!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching zero masking with broadcasts in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG132_imm8 imm:$src4))>;
// Additional patterns for matching masked broadcasts with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
(i8 imm:$src4)), _.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
let Predicates = [HasAVX512] in
defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
}
}
defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.IntVT _.RC:$src3),
(i32 imm:$src4),
(i32 FROUND_CURRENT))>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.IntVT (bitconvert (_.LdFrag addr:$src3))),
(i32 imm:$src4),
(i32 FROUND_CURRENT))>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
(i32 FROUND_CURRENT))>, EVEX_B;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.IntVT _.RC:$src3),
(i32 imm:$src4),
(i32 FROUND_NO_EXC))>, EVEX_B;
}
}
multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
(i32 FROUND_CURRENT))>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
(i32 FROUND_NO_EXC))>, EVEX_B;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
(i32 FROUND_CURRENT))>;
}
}
multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{
let Predicates = [HasAVX512] in
defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
AVX512AIi8Base, EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>,
AVX512AIi8Base, EVEX_4V, EVEX_V128;
defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>,
AVX512AIi8Base, EVEX_4V, EVEX_V256;
}
}
defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
f32x_info, v4i32x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
f64x_info, v2i64x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
EVEX_CD8<32, CD8VF>;
defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
// __m128 foo(__m128 A, __m128 B) {
// A[0] += B[0];
// return A;
// }
//
// Previously we generated:
// addss %xmm0, %xmm1
// movss %xmm1, %xmm0
//
// We now generate:
// addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
// __m128 foo(__m128 A, __m128 B) {
// __m128 C = A + B;
// return (__m128) {C[0], A[1], A[2], A[3]};
// }
//
// Previously we generated:
// addps %xmm0, %xmm1
// movss %xmm1, %xmm0
//
// We now generate:
// addss %xmm1, %xmm0
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
FR32X:$src))))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32X:$src, VR128X))>;
// extracted scalar math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
FR32X:$src))), (i8 1))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32X:$src, VR128X))>;
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
(Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
// vector math op with insert via blend
def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
(Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(X86Movss (v4f32 VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
(Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
FR32X:$src2),
FR32X:$src0))),
(!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
VK1WM:$mask, v4f32:$src1,
(COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
}
}
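// For example, the fadd/"ADD" instantiation below selects VADDSSZrr_Int (and
// its masked _Intk form) for a scalar add whose result is re-inserted via
// movss or a blend, so no separate element insert is emitted.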
defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
FR64X:$src))))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64X:$src, VR128X))>;
// extracted scalar math op with insert via blend
def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
FR64X:$src))), (i8 1))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64X:$src, VR128X))>;
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
(Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
// vector math op with insert via blend
def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
(Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
// extracted masked scalar math op with insert via movsd
def : Pat<(X86Movsd (v2f64 VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
(Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
FR64X:$src2),
FR64X:$src0))),
(!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
VK1WM:$mask, v2f64:$src1,
(COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
}
}
defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
diff --git a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index a7de79306074..fc15dc1e6032 100644
--- a/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -1,160 +1,167 @@
//===- DlltoolDriver.cpp - dlltool.exe-compatible driver ------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Defines an interface to a dlltool.exe-compatible driver.
//
//===----------------------------------------------------------------------===//
#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/COFFModuleDefinition.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/Path.h"
#include <string>
#include <vector>
using namespace llvm;
using namespace llvm::object;
using namespace llvm::COFF;
namespace {
enum {
OPT_INVALID = 0,
#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
#include "Options.inc"
#undef OPTION
};
#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
#include "Options.inc"
#undef PREFIX
static const llvm::opt::OptTable::Info infoTable[] = {
#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \
{X1, X2, X10, X11, OPT_##ID, llvm::opt::Option::KIND##Class, \
X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12},
#include "Options.inc"
#undef OPTION
};
class DllOptTable : public llvm::opt::OptTable {
public:
DllOptTable() : OptTable(infoTable, false) {}
};
} // namespace
std::vector<std::unique_ptr<MemoryBuffer>> OwningMBs;
// Opens a file. Path has to be resolved already.
// Newly created memory buffers are owned by this driver.
-MemoryBufferRef openFile(StringRef Path) {
+Optional<MemoryBufferRef> openFile(StringRef Path) {
ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MB = MemoryBuffer::getFile(Path);
- if (std::error_code EC = MB.getError())
+ if (std::error_code EC = MB.getError()) {
llvm::errs() << "fail openFile: " << EC.message() << "\n";
+ return None;
+ }
MemoryBufferRef MBRef = MB.get()->getMemBufferRef();
OwningMBs.push_back(std::move(MB.get())); // take ownership
return MBRef;
}
static MachineTypes getEmulation(StringRef S) {
return StringSwitch<MachineTypes>(S)
.Case("i386", IMAGE_FILE_MACHINE_I386)
.Case("i386:x86-64", IMAGE_FILE_MACHINE_AMD64)
.Case("arm", IMAGE_FILE_MACHINE_ARMNT)
.Default(IMAGE_FILE_MACHINE_UNKNOWN);
}
static std::string getImplibPath(std::string Path) {
SmallString<128> Out = StringRef("lib");
Out.append(Path);
sys::path::replace_extension(Out, ".a");
return Out.str();
}
int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
DllOptTable Table;
unsigned MissingIndex;
unsigned MissingCount;
llvm::opt::InputArgList Args =
Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount);
if (MissingCount) {
llvm::errs() << Args.getArgString(MissingIndex) << ": missing argument\n";
return 1;
}
// Handle when no input or output is specified
if (Args.hasArgNoClaim(OPT_INPUT) ||
(!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) {
Table.PrintHelp(outs(), ArgsArr[0], "dlltool", false);
llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm\n";
return 1;
}
if (!Args.hasArgNoClaim(OPT_m) && Args.hasArgNoClaim(OPT_d)) {
llvm::errs() << "error: no target machine specified\n"
<< "supported targets: i386, i386:x86-64, arm\n";
return 1;
}
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
- MemoryBufferRef MB;
- if (auto *Arg = Args.getLastArg(OPT_d))
- MB = openFile(Arg->getValue());
+ if (!Args.hasArg(OPT_d)) {
+ llvm::errs() << "no definition file specified\n";
+ return 1;
+ }
+
+ Optional<MemoryBufferRef> MB = openFile(Args.getLastArg(OPT_d)->getValue());
+ if (!MB)
+ return 1;
- if (!MB.getBufferSize()) {
+ if (!MB->getBufferSize()) {
llvm::errs() << "definition file empty\n";
return 1;
}
COFF::MachineTypes Machine = IMAGE_FILE_MACHINE_UNKNOWN;
if (auto *Arg = Args.getLastArg(OPT_m))
Machine = getEmulation(Arg->getValue());
if (Machine == IMAGE_FILE_MACHINE_UNKNOWN) {
llvm::errs() << "unknown target\n";
return 1;
}
Expected<COFFModuleDefinition> Def =
- parseCOFFModuleDefinition(MB, Machine, true);
+ parseCOFFModuleDefinition(*MB, Machine, true);
if (!Def) {
llvm::errs() << "error parsing definition\n"
<< errorToErrorCode(Def.takeError()).message();
return 1;
}
// Do this after the parser because parseCOFFModuleDefinition sets OutputFile.
if (auto *Arg = Args.getLastArg(OPT_D))
Def->OutputFile = Arg->getValue();
if (Def->OutputFile.empty()) {
llvm::errs() << "no output file specified\n";
return 1;
}
std::string Path = Args.getLastArgValue(OPT_l);
if (Path.empty())
Path = getImplibPath(Def->OutputFile);
- if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine))
+ if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine, true))
return 1;
return 0;
}
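For illustration, a minimal sketch of a caller exercising the tightened error handling above; the file names and target are placeholders, and dlltoolDriverMain now returns non-zero when the definition file is missing, unreadable, or empty:
#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
int main() {
  // Hypothetical invocation: build libexample.a from example.def for x86-64.
  const char *Args[] = {"llvm-dlltool", "-m", "i386:x86-64",
                        "-d", "example.def", "-l", "libexample.a"};
  return llvm::dlltoolDriverMain(Args); // 1 on any of the new error paths
}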
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index a33490f6e4ac..ddc975cbed1a 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1,1628 +1,1638 @@
//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
/// analysis.
///
/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
/// class of bugs on its own. Instead, it provides a generic dynamic data flow
/// analysis framework to be used by clients to help detect application-specific
/// issues within their own code.
///
/// The analysis is based on automatic propagation of data flow labels (also
/// known as taint labels) through a program as it performs computation. Each
/// byte of application memory is backed by two bytes of shadow memory which
/// hold the label. On Linux/x86_64, memory is laid out as follows:
///
/// +--------------------+ 0x800000000000 (top of memory)
/// | application memory |
/// +--------------------+ 0x700000008000 (kAppAddr)
/// | |
/// | unused |
/// | |
/// +--------------------+ 0x200200000000 (kUnusedAddr)
/// | union table |
/// +--------------------+ 0x200000000000 (kUnionTableAddr)
/// | shadow memory |
/// +--------------------+ 0x000000010000 (kShadowAddr)
/// | reserved by kernel |
/// +--------------------+ 0x000000000000
///
/// To derive a shadow memory address from an application memory address,
/// bits 44-46 are cleared to bring the address into the range
/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
/// account for the double byte representation of shadow labels and move the
/// address into the shadow memory range. See the function
/// DataFlowSanitizer::getShadowAddress below.
///
/// For more information, please refer to the design document:
/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <iterator>
#include <set>
#include <utility>
using namespace llvm;
// External symbol to be used when generating the shadow address for
// architectures with multiple VMAs. Instead of using a constant integer
// the runtime will set the external mask based on the VMA range.
static const char *const kDFSanExternShadowPtrMask = "__dfsan_shadow_ptr_mask";
// The -dfsan-preserve-alignment flag controls whether this pass assumes that
// alignment requirements provided by the input IR are correct. For example,
// if the input IR contains a load with alignment 8, this flag will cause
// the shadow load to have alignment 16. This flag is disabled by default as
// we have unfortunately encountered too much code (including Clang itself;
// see PR14291) which performs misaligned access.
static cl::opt<bool> ClPreserveAlignment(
"dfsan-preserve-alignment",
cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
cl::init(false));
// The ABI list files control how shadow parameters are passed. The pass treats
// every function labelled "uninstrumented" in the ABI list file as conforming
// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
// additional annotations for those functions, a call to one of those functions
// will produce a warning message, as the labelling behaviour of the function is
// unknown. The other supported annotations are "functional" and "discard",
// which are described below under DataFlowSanitizer::WrapperKind.
static cl::list<std::string> ClABIListFiles(
"dfsan-abilist",
cl::desc("File listing native ABI functions and how the pass treats them"),
cl::Hidden);
// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
// functions (see DataFlowSanitizer::InstrumentedABI below).
static cl::opt<bool> ClArgsABI(
"dfsan-args-abi",
cl::desc("Use the argument ABI rather than the TLS ABI"),
cl::Hidden);
// Controls whether the pass includes or ignores the labels of pointers in load
// instructions.
static cl::opt<bool> ClCombinePointerLabelsOnLoad(
"dfsan-combine-pointer-labels-on-load",
cl::desc("Combine the label of the pointer with the label of the data when "
"loading from memory."),
cl::Hidden, cl::init(true));
// Controls whether the pass includes or ignores the labels of pointers in
// store instructions.
static cl::opt<bool> ClCombinePointerLabelsOnStore(
"dfsan-combine-pointer-labels-on-store",
cl::desc("Combine the label of the pointer with the label of the data when "
"storing in memory."),
cl::Hidden, cl::init(false));
static cl::opt<bool> ClDebugNonzeroLabels(
"dfsan-debug-nonzero-labels",
cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
"load or return with a nonzero label"),
cl::Hidden);
namespace {
StringRef GetGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getValueType();
// For now we support blacklisting struct types only.
if (StructType *SGType = dyn_cast<StructType>(GType)) {
if (!SGType->isLiteral())
return SGType->getName();
}
return "<unknown type>";
}
class DFSanABIList {
std::unique_ptr<SpecialCaseList> SCL;
public:
DFSanABIList() {}
void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
/// Returns whether either this function or its source file are listed in the
/// given category.
bool isIn(const Function &F, StringRef Category) const {
return isIn(*F.getParent(), Category) ||
SCL->inSection("fun", F.getName(), Category);
}
/// Returns whether this global alias is listed in the given category.
///
/// If GA aliases a function, the alias's name is matched as a function name
/// would be. Similarly, aliases of globals are matched like globals.
bool isIn(const GlobalAlias &GA, StringRef Category) const {
if (isIn(*GA.getParent(), Category))
return true;
if (isa<FunctionType>(GA.getValueType()))
return SCL->inSection("fun", GA.getName(), Category);
return SCL->inSection("global", GA.getName(), Category) ||
SCL->inSection("type", GetGlobalTypeString(GA), Category);
}
/// Returns whether this module is listed in the given category.
bool isIn(const Module &M, StringRef Category) const {
return SCL->inSection("src", M.getModuleIdentifier(), Category);
}
};
class DataFlowSanitizer : public ModulePass {
friend struct DFSanFunction;
friend class DFSanVisitor;
enum {
ShadowWidth = 16
};
/// Which ABI should be used for instrumented functions?
enum InstrumentedABI {
/// Argument and return value labels are passed through additional
/// arguments and by modifying the return type.
IA_Args,
/// Argument and return value labels are passed through TLS variables
/// __dfsan_arg_tls and __dfsan_retval_tls.
IA_TLS
};
/// How should calls to uninstrumented functions be handled?
enum WrapperKind {
/// This function is present in an uninstrumented form but we don't know
/// how it should be handled. Print a warning and call the function anyway.
/// Don't label the return value.
WK_Warning,
/// This function does not write to (user-accessible) memory, and its return
/// value is unlabelled.
WK_Discard,
/// This function does not write to (user-accessible) memory, and the label
/// of its return value is the union of the label of its arguments.
WK_Functional,
/// Instead of calling the function, a custom wrapper __dfsw_F is called,
/// where F is the name of the function. This function may wrap the
/// original function or provide its own implementation. This is similar to
/// the IA_Args ABI, except that IA_Args uses a struct return type to
/// pass the return value shadow in a register, while WK_Custom uses an
/// extra pointer argument to return the shadow. This allows the wrapped
/// form of the function type to be expressed in C.
WK_Custom
};
Module *Mod;
LLVMContext *Ctx;
IntegerType *ShadowTy;
PointerType *ShadowPtrTy;
IntegerType *IntptrTy;
ConstantInt *ZeroShadow;
ConstantInt *ShadowPtrMask;
ConstantInt *ShadowPtrMul;
Constant *ArgTLS;
Constant *RetvalTLS;
void *(*GetArgTLSPtr)();
void *(*GetRetvalTLSPtr)();
Constant *GetArgTLS;
Constant *GetRetvalTLS;
Constant *ExternalShadowMask;
FunctionType *DFSanUnionFnTy;
FunctionType *DFSanUnionLoadFnTy;
FunctionType *DFSanUnimplementedFnTy;
FunctionType *DFSanSetLabelFnTy;
FunctionType *DFSanNonzeroLabelFnTy;
FunctionType *DFSanVarargWrapperFnTy;
Constant *DFSanUnionFn;
Constant *DFSanCheckedUnionFn;
Constant *DFSanUnionLoadFn;
Constant *DFSanUnimplementedFn;
Constant *DFSanSetLabelFn;
Constant *DFSanNonzeroLabelFn;
Constant *DFSanVarargWrapperFn;
MDNode *ColdCallWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
AttrBuilder ReadOnlyNoneAttrs;
bool DFSanRuntimeShadowMask;
Value *getShadowAddress(Value *Addr, Instruction *Pos);
bool isInstrumented(const Function *F);
bool isInstrumented(const GlobalAlias *GA);
FunctionType *getArgsFunctionType(FunctionType *T);
FunctionType *getTrampolineFunctionType(FunctionType *T);
FunctionType *getCustomFunctionType(FunctionType *T);
InstrumentedABI getInstrumentedABI();
WrapperKind getWrapperKind(Function *F);
void addGlobalNamePrefix(GlobalValue *GV);
Function *buildWrapperFunction(Function *F, StringRef NewFName,
GlobalValue::LinkageTypes NewFLink,
FunctionType *NewFT);
Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
public:
DataFlowSanitizer(
const std::vector<std::string> &ABIListFiles = std::vector<std::string>(),
void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr);
static char ID;
bool doInitialization(Module &M) override;
bool runOnModule(Module &M) override;
};
struct DFSanFunction {
DataFlowSanitizer &DFS;
Function *F;
DominatorTree DT;
DataFlowSanitizer::InstrumentedABI IA;
bool IsNativeABI;
Value *ArgTLSPtr;
Value *RetvalTLSPtr;
AllocaInst *LabelReturnAlloca;
DenseMap<Value *, Value *> ValShadowMap;
DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
DenseSet<Instruction *> SkipInsts;
std::vector<Value *> NonZeroChecks;
bool AvoidNewBlocks;
struct CachedCombinedShadow {
BasicBlock *Block;
Value *Shadow;
};
DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow>
CachedCombinedShadows;
DenseMap<Value *, std::set<Value *>> ShadowElements;
DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
: DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr),
LabelReturnAlloca(nullptr) {
DT.recalculate(*F);
// FIXME: Need to track down the register allocator issue which causes poor
// performance in pathological cases with large numbers of basic blocks.
AvoidNewBlocks = F->size() > 1000;
}
Value *getArgTLSPtr();
Value *getArgTLS(unsigned Index, Instruction *Pos);
Value *getRetvalTLS();
Value *getShadow(Value *V);
void setShadow(Instruction *I, Value *Shadow);
Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
Value *combineOperandShadows(Instruction *Inst);
Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
Instruction *Pos);
void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
Instruction *Pos);
};
class DFSanVisitor : public InstVisitor<DFSanVisitor> {
public:
DFSanFunction &DFSF;
DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
const DataLayout &getDataLayout() const {
return DFSF.F->getParent()->getDataLayout();
}
void visitOperandShadowInst(Instruction &I);
void visitBinaryOperator(BinaryOperator &BO);
void visitCastInst(CastInst &CI);
void visitCmpInst(CmpInst &CI);
void visitGetElementPtrInst(GetElementPtrInst &GEPI);
void visitLoadInst(LoadInst &LI);
void visitStoreInst(StoreInst &SI);
void visitReturnInst(ReturnInst &RI);
void visitCallSite(CallSite CS);
void visitPHINode(PHINode &PN);
void visitExtractElementInst(ExtractElementInst &I);
void visitInsertElementInst(InsertElementInst &I);
void visitShuffleVectorInst(ShuffleVectorInst &I);
void visitExtractValueInst(ExtractValueInst &I);
void visitInsertValueInst(InsertValueInst &I);
void visitAllocaInst(AllocaInst &I);
void visitSelectInst(SelectInst &I);
void visitMemSetInst(MemSetInst &I);
void visitMemTransferInst(MemTransferInst &I);
};
}
char DataFlowSanitizer::ID;
INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
"DataFlowSanitizer: dynamic data flow analysis.", false, false)
ModulePass *
llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles,
void *(*getArgTLS)(),
void *(*getRetValTLS)()) {
return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS);
}
DataFlowSanitizer::DataFlowSanitizer(
const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(),
void *(*getRetValTLS)())
: ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
DFSanRuntimeShadowMask(false) {
std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(),
ClABIListFiles.end());
ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles));
}
FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), ShadowTy);
if (T->isVarArg())
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
assert(!T->isVarArg());
llvm::SmallVector<Type *, 4> ArgTypes;
ArgTypes.push_back(T->getPointerTo());
ArgTypes.append(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), ShadowTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
ArgTypes.push_back(ShadowPtrTy);
return FunctionType::get(T->getReturnType(), ArgTypes, false);
}
FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
llvm::SmallVector<Type *, 4> ArgTypes;
for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
i != e; ++i) {
FunctionType *FT;
if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
*i)->getElementType()))) {
ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
} else {
ArgTypes.push_back(*i);
}
}
for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
ArgTypes.push_back(ShadowTy);
if (T->isVarArg())
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
ArgTypes.push_back(ShadowPtrTy);
return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg());
}
bool DataFlowSanitizer::doInitialization(Module &M) {
llvm::Triple TargetTriple(M.getTargetTriple());
bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
TargetTriple.getArch() == llvm::Triple::mips64el;
bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 ||
TargetTriple.getArch() == llvm::Triple::aarch64_be;
const DataLayout &DL = M.getDataLayout();
Mod = &M;
Ctx = &M.getContext();
ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
ShadowPtrTy = PointerType::getUnqual(ShadowTy);
IntptrTy = DL.getIntPtrType(*Ctx);
ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
if (IsX86_64)
ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
else if (IsMIPS64)
ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
// AArch64 supports multiple VMAs and the shadow mask is set at runtime.
else if (IsAArch64)
DFSanRuntimeShadowMask = true;
else
report_fatal_error("unsupported triple");
Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
DFSanUnionFnTy =
FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false);
Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy };
DFSanUnionLoadFnTy =
FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false);
DFSanUnimplementedFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy };
DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
DFSanSetLabelArgs, /*isVarArg=*/false);
DFSanNonzeroLabelFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
DFSanVarargWrapperFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
if (GetArgTLSPtr) {
Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
ArgTLS = nullptr;
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
return true;
}
bool DataFlowSanitizer::isInstrumented(const Function *F) {
return !ABIList.isIn(*F, "uninstrumented");
}
bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
return !ABIList.isIn(*GA, "uninstrumented");
}
DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
return ClArgsABI ? IA_Args : IA_TLS;
}
DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
if (ABIList.isIn(*F, "functional"))
return WK_Functional;
if (ABIList.isIn(*F, "discard"))
return WK_Discard;
if (ABIList.isIn(*F, "custom"))
return WK_Custom;
return WK_Warning;
}
void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
std::string GVName = GV->getName(), Prefix = "dfs$";
GV->setName(Prefix + GVName);
// Try to change the name of the function in module inline asm. We only do
// this for specific asm directives, currently only ".symver", to try to avoid
// corrupting asm which happens to contain the symbol name as a substring.
// Note that the substitution for .symver assumes that the versioned symbol
// also has an instrumented name.
std::string Asm = GV->getParent()->getModuleInlineAsm();
std::string SearchStr = ".symver " + GVName + ",";
size_t Pos = Asm.find(SearchStr);
if (Pos != std::string::npos) {
Asm.replace(Pos, SearchStr.size(),
".symver " + Prefix + GVName + "," + Prefix);
GV->getParent()->setModuleInlineAsm(Asm);
}
}
Function *
DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
GlobalValue::LinkageTypes NewFLink,
FunctionType *NewFT) {
FunctionType *FT = F->getFunctionType();
Function *NewF = Function::Create(NewFT, NewFLink, NewFName,
F->getParent());
NewF->copyAttributesFrom(F);
NewF->removeAttributes(
AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
if (F->isVarArg()) {
NewF->removeAttributes(AttributeList::FunctionIndex,
AttrBuilder().addAttribute("split-stack"));
CallInst::Create(DFSanVarargWrapperFn,
IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
BB);
new UnreachableInst(*Ctx, BB);
} else {
std::vector<Value *> Args;
unsigned n = FT->getNumParams();
for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
Args.push_back(&*ai);
CallInst *CI = CallInst::Create(F, Args, "", BB);
if (FT->getReturnType()->isVoidTy())
ReturnInst::Create(*Ctx, BB);
else
ReturnInst::Create(*Ctx, CI, BB);
}
return NewF;
}
Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
StringRef FName) {
FunctionType *FTT = getTrampolineFunctionType(FT);
Constant *C = Mod->getOrInsertFunction(FName, FTT);
Function *F = dyn_cast<Function>(C);
if (F && F->isDeclaration()) {
F->setLinkage(GlobalValue::LinkOnceODRLinkage);
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
std::vector<Value *> Args;
Function::arg_iterator AI = F->arg_begin(); ++AI;
for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
Args.push_back(&*AI);
CallInst *CI = CallInst::Create(&*F->arg_begin(), Args, "", BB);
ReturnInst *RI;
if (FT->getReturnType()->isVoidTy())
RI = ReturnInst::Create(*Ctx, BB);
else
RI = ReturnInst::Create(*Ctx, CI, BB);
DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N)
DFSF.ValShadowMap[&*ValAI] = &*ShadowAI;
DFSanVisitor(DFSF).visitCallInst(*CI);
if (!FT->getReturnType()->isVoidTy())
new StoreInst(DFSF.getShadow(RI->getReturnValue()),
&*std::prev(F->arg_end()), RI);
}
return C;
}
bool DataFlowSanitizer::runOnModule(Module &M) {
if (ABIList.isIn(M, "skip"))
return false;
if (!GetArgTLSPtr) {
Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
}
if (!GetRetvalTLSPtr) {
RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
}
ExternalShadowMask =
Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
F->addParamAttr(0, Attribute::ZExt);
F->addParamAttr(1, Attribute::ZExt);
}
DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
F->addParamAttr(0, Attribute::ZExt);
F->addParamAttr(1, Attribute::ZExt);
}
DFSanUnionLoadFn =
Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
}
DFSanUnimplementedFn =
Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
DFSanSetLabelFn =
Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
F->addParamAttr(0, Attribute::ZExt);
}
DFSanNonzeroLabelFn =
Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
DFSanVarargWrapperFnTy);
std::vector<Function *> FnsToInstrument;
llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
for (Function &i : M) {
if (!i.isIntrinsic() &&
&i != DFSanUnionFn &&
&i != DFSanCheckedUnionFn &&
&i != DFSanUnionLoadFn &&
&i != DFSanUnimplementedFn &&
&i != DFSanSetLabelFn &&
&i != DFSanNonzeroLabelFn &&
&i != DFSanVarargWrapperFn)
FnsToInstrument.push_back(&i);
}
// Give function aliases prefixes when necessary, and build wrappers where the
// instrumentedness is inconsistent.
for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
GlobalAlias *GA = &*i;
++i;
// Don't stop on weak. We assume people aren't playing games with the
// instrumentedness of overridden weak aliases.
if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
if (GAInst && FInst) {
addGlobalNamePrefix(GA);
} else if (GAInst != FInst) {
// Non-instrumented alias of an instrumented function, or vice versa.
// Replace the alias with a native-ABI wrapper of the aliasee. The pass
// below will take care of instrumenting it.
Function *NewF =
buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
NewF->takeName(GA);
GA->eraseFromParent();
FnsToInstrument.push_back(NewF);
}
}
}
ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone);
// First, change the ABI of every function in the module. ABI-listed
// functions keep their original ABI and get a wrapper function.
for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
e = FnsToInstrument.end();
i != e; ++i) {
Function &F = **i;
FunctionType *FT = F.getFunctionType();
bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
FT->getReturnType()->isVoidTy());
if (isInstrumented(&F)) {
// Instrumented functions get a 'dfs$' prefix. This allows us to more
// easily identify cases of mismatching ABIs.
if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
FunctionType *NewFT = getArgsFunctionType(FT);
Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
NewF->copyAttributesFrom(&F);
NewF->removeAttributes(
AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
for (Function::arg_iterator FArg = F.arg_begin(),
NewFArg = NewF->arg_begin(),
FArgEnd = F.arg_end();
FArg != FArgEnd; ++FArg, ++NewFArg) {
FArg->replaceAllUsesWith(&*NewFArg);
}
NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
UI != UE;) {
BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
++UI;
if (BA) {
BA->replaceAllUsesWith(
BlockAddress::get(NewF, BA->getBasicBlock()));
delete BA;
}
}
F.replaceAllUsesWith(
ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
NewF->takeName(&F);
F.eraseFromParent();
*i = NewF;
addGlobalNamePrefix(NewF);
} else {
addGlobalNamePrefix(&F);
}
} else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
// Build a wrapper function for F. The wrapper simply calls F, and is
// added to FnsToInstrument so that any instrumentation according to its
// WrapperKind is done in the second pass below.
FunctionType *NewFT = getInstrumentedABI() == IA_Args
? getArgsFunctionType(FT)
: FT;
Function *NewF = buildWrapperFunction(
&F, std::string("dfsw$") + std::string(F.getName()),
GlobalValue::LinkOnceODRLinkage, NewFT);
if (getInstrumentedABI() == IA_TLS)
NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
Value *WrappedFnCst =
ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
F.replaceAllUsesWith(WrappedFnCst);
UnwrappedFnMap[WrappedFnCst] = &F;
*i = NewF;
if (!F.isDeclaration()) {
// This function is probably defining an interposition of an
// uninstrumented function and hence needs to keep the original ABI.
// But any functions it may call need to use the instrumented ABI, so
// we instrument it in a mode which preserves the original ABI.
FnsWithNativeABI.insert(&F);
// This code needs to rebuild the iterators, as they may be invalidated
// by the push_back, taking care that the new range does not include
// any functions added by this code.
size_t N = i - FnsToInstrument.begin(),
Count = e - FnsToInstrument.begin();
FnsToInstrument.push_back(&F);
i = FnsToInstrument.begin() + N;
e = FnsToInstrument.begin() + Count;
}
// Hopefully, nobody will try to indirectly call a vararg
// function... yet.
} else if (FT->isVarArg()) {
UnwrappedFnMap[&F] = &F;
*i = nullptr;
}
}
for (Function *i : FnsToInstrument) {
if (!i || i->isDeclaration())
continue;
removeUnreachableBlocks(*i);
DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
// DFSanVisitor may create new basic blocks, which confuses df_iterator.
// Build a copy of the list before iterating over it.
llvm::SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
for (BasicBlock *i : BBList) {
Instruction *Inst = &i->front();
while (1) {
// DFSanVisitor may split the current basic block, changing the current
// instruction's next pointer and moving the next instruction to the
// tail block from which we should continue.
Instruction *Next = Inst->getNextNode();
// DFSanVisitor may delete Inst, so keep track of whether it was a
// terminator.
bool IsTerminator = isa<TerminatorInst>(Inst);
if (!DFSF.SkipInsts.count(Inst))
DFSanVisitor(DFSF).visit(Inst);
if (IsTerminator)
break;
Inst = Next;
}
}
// We will not necessarily be able to compute the shadow for every phi node
// until we have visited every block. Therefore, the code that handles phi
// nodes adds them to the PHIFixups list so that they can be properly
// handled here.
for (std::vector<std::pair<PHINode *, PHINode *> >::iterator
i = DFSF.PHIFixups.begin(),
e = DFSF.PHIFixups.end();
i != e; ++i) {
for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
++val) {
i->second->setIncomingValue(
val, DFSF.getShadow(i->first->getIncomingValue(val)));
}
}
// -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
// places (i.e. instructions in basic blocks we haven't even begun visiting
// yet). To make our life easier, do this work in a pass after the main
// instrumentation.
if (ClDebugNonzeroLabels) {
for (Value *V : DFSF.NonZeroChecks) {
Instruction *Pos;
if (Instruction *I = dyn_cast<Instruction>(V))
Pos = I->getNextNode();
else
Pos = &DFSF.F->getEntryBlock().front();
while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
Pos = Pos->getNextNode();
IRBuilder<> IRB(Pos);
Value *Ne = IRB.CreateICmpNE(V, DFSF.DFS.ZeroShadow);
BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
IRBuilder<> ThenIRB(BI);
ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
}
}
}
return false;
}
Value *DFSanFunction::getArgTLSPtr() {
if (ArgTLSPtr)
return ArgTLSPtr;
if (DFS.ArgTLS)
return ArgTLSPtr = DFS.ArgTLS;
IRBuilder<> IRB(&F->getEntryBlock().front());
return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {});
}
Value *DFSanFunction::getRetvalTLS() {
if (RetvalTLSPtr)
return RetvalTLSPtr;
if (DFS.RetvalTLS)
return RetvalTLSPtr = DFS.RetvalTLS;
IRBuilder<> IRB(&F->getEntryBlock().front());
return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {});
}
Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) {
IRBuilder<> IRB(Pos);
return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx);
}
Value *DFSanFunction::getShadow(Value *V) {
if (!isa<Argument>(V) && !isa<Instruction>(V))
return DFS.ZeroShadow;
Value *&Shadow = ValShadowMap[V];
if (!Shadow) {
if (Argument *A = dyn_cast<Argument>(V)) {
if (IsNativeABI)
return DFS.ZeroShadow;
switch (IA) {
case DataFlowSanitizer::IA_TLS: {
Value *ArgTLSPtr = getArgTLSPtr();
Instruction *ArgTLSPos =
DFS.ArgTLS ? &*F->getEntryBlock().begin()
: cast<Instruction>(ArgTLSPtr)->getNextNode();
IRBuilder<> IRB(ArgTLSPos);
Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos));
break;
}
case DataFlowSanitizer::IA_Args: {
unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
Function::arg_iterator i = F->arg_begin();
while (ArgIdx--)
++i;
Shadow = &*i;
assert(Shadow->getType() == DFS.ShadowTy);
break;
}
}
NonZeroChecks.push_back(Shadow);
} else {
Shadow = DFS.ZeroShadow;
}
}
return Shadow;
}
void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
assert(!ValShadowMap.count(I));
assert(Shadow->getType() == DFS.ShadowTy);
ValShadowMap[I] = Shadow;
}
Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
assert(Addr != RetvalTLS && "Reinstrumenting?");
IRBuilder<> IRB(Pos);
Value *ShadowPtrMaskValue;
if (DFSanRuntimeShadowMask)
ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
else
ShadowPtrMaskValue = ShadowPtrMask;
return IRB.CreateIntToPtr(
IRB.CreateMul(
IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
ShadowPtrMul),
ShadowPtrTy);
}
// Generates IR to compute the union of the two given shadows, inserting it
// before Pos. Returns the computed union Value.
Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
if (V1 == DFS.ZeroShadow)
return V2;
if (V2 == DFS.ZeroShadow)
return V1;
if (V1 == V2)
return V1;
auto V1Elems = ShadowElements.find(V1);
auto V2Elems = ShadowElements.find(V2);
if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
V2Elems->second.begin(), V2Elems->second.end())) {
return V1;
} else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
V1Elems->second.begin(), V1Elems->second.end())) {
return V2;
}
} else if (V1Elems != ShadowElements.end()) {
if (V1Elems->second.count(V2))
return V1;
} else if (V2Elems != ShadowElements.end()) {
if (V2Elems->second.count(V1))
return V2;
}
auto Key = std::make_pair(V1, V2);
if (V1 > V2)
std::swap(Key.first, Key.second);
CachedCombinedShadow &CCS = CachedCombinedShadows[Key];
if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
return CCS.Shadow;
IRBuilder<> IRB(Pos);
if (AvoidNewBlocks) {
CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2});
Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
Call->addParamAttr(0, Attribute::ZExt);
Call->addParamAttr(1, Attribute::ZExt);
CCS.Block = Pos->getParent();
CCS.Shadow = Call;
} else {
BasicBlock *Head = Pos->getParent();
Value *Ne = IRB.CreateICmpNE(V1, V2);
BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
IRBuilder<> ThenIRB(BI);
CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2});
Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
Call->addParamAttr(0, Attribute::ZExt);
Call->addParamAttr(1, Attribute::ZExt);
BasicBlock *Tail = BI->getSuccessor(0);
PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
Phi->addIncoming(Call, Call->getParent());
Phi->addIncoming(V1, Head);
CCS.Block = Tail;
CCS.Shadow = Phi;
}
std::set<Value *> UnionElems;
if (V1Elems != ShadowElements.end()) {
UnionElems = V1Elems->second;
} else {
UnionElems.insert(V1);
}
if (V2Elems != ShadowElements.end()) {
UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
} else {
UnionElems.insert(V2);
}
ShadowElements[CCS.Shadow] = std::move(UnionElems);
return CCS.Shadow;
}
// A convenience function which folds the shadows of each of the operands
// of the provided instruction Inst, inserting the IR before Inst. Returns
// the computed union Value.
Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
if (Inst->getNumOperands() == 0)
return DFS.ZeroShadow;
Value *Shadow = getShadow(Inst->getOperand(0));
for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
}
return Shadow;
}
void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
Value *CombinedShadow = DFSF.combineOperandShadows(&I);
DFSF.setShadow(&I, CombinedShadow);
}
// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
// Addr has alignment Align, and take the union of each of those shadows.
Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
Instruction *Pos) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
AllocaShadowMap.find(AI);
if (i != AllocaShadowMap.end()) {
IRBuilder<> IRB(Pos);
return IRB.CreateLoad(i->second);
}
}
uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
SmallVector<Value *, 2> Objs;
GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout());
bool AllConstants = true;
for (Value *Obj : Objs) {
if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
continue;
if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
continue;
AllConstants = false;
break;
}
if (AllConstants)
return DFS.ZeroShadow;
Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
switch (Size) {
case 0:
return DFS.ZeroShadow;
case 1: {
LoadInst *LI = new LoadInst(ShadowAddr, "", Pos);
LI->setAlignment(ShadowAlign);
return LI;
}
case 2: {
IRBuilder<> IRB(Pos);
Value *ShadowAddr1 = IRB.CreateGEP(DFS.ShadowTy, ShadowAddr,
ConstantInt::get(DFS.IntptrTy, 1));
return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos);
}
}
if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) {
// Fast path for the common case where each byte has identical shadow: load
// shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
// shadow is non-equal.
BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
IRBuilder<> FallbackIRB(FallbackBB);
CallInst *FallbackCall = FallbackIRB.CreateCall(
DFS.DFSanUnionLoadFn,
{ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
// Compare each of the shadows stored in the loaded 64 bits to each other,
// by computing (WideShadow rotl ShadowWidth) == WideShadow.
IRBuilder<> IRB(Pos);
Value *WideAddr =
IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign);
Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
BasicBlock *Head = Pos->getParent();
BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
if (DomTreeNode *OldNode = DT.getNode(Head)) {
std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
for (auto Child : Children)
DT.changeImmediateDominator(Child, NewNode);
}
// In the following code LastBr will refer to the previous basic block's
// conditional branch instruction, whose true successor is fixed up to point
// to the next block during the loop below or to the tail after the final
// iteration.
BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
ReplaceInstWithInst(Head->getTerminator(), LastBr);
DT.addNewBlock(FallbackBB, Head);
for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
Ofs += 64 / DFS.ShadowWidth) {
BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
DT.addNewBlock(NextBB, LastBr->getParent());
IRBuilder<> NextIRB(NextBB);
WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
ConstantInt::get(DFS.IntptrTy, 1));
Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
LastBr->setSuccessor(0, NextBB);
LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
}
LastBr->setSuccessor(0, Tail);
FallbackIRB.CreateBr(Tail);
PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
Shadow->addIncoming(FallbackCall, FallbackBB);
Shadow->addIncoming(TruncShadow, LastBr->getParent());
return Shadow;
}
IRBuilder<> IRB(Pos);
CallInst *FallbackCall = IRB.CreateCall(
DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
return FallbackCall;
}
void DFSanVisitor::visitLoadInst(LoadInst &LI) {
auto &DL = LI.getModule()->getDataLayout();
uint64_t Size = DL.getTypeStoreSize(LI.getType());
if (Size == 0) {
DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow);
return;
}
uint64_t Align;
if (ClPreserveAlignment) {
Align = LI.getAlignment();
if (Align == 0)
Align = DL.getABITypeAlignment(LI.getType());
} else {
Align = 1;
}
IRBuilder<> IRB(&LI);
Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
if (ClCombinePointerLabelsOnLoad) {
Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI);
}
if (Shadow != DFSF.DFS.ZeroShadow)
DFSF.NonZeroChecks.push_back(Shadow);
DFSF.setShadow(&LI, Shadow);
}
void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
Value *Shadow, Instruction *Pos) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
AllocaShadowMap.find(AI);
if (i != AllocaShadowMap.end()) {
IRBuilder<> IRB(Pos);
IRB.CreateStore(Shadow, i->second);
return;
}
}
uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
IRBuilder<> IRB(Pos);
Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
if (Shadow == DFS.ZeroShadow) {
IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
Value *ExtShadowAddr =
IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
return;
}
const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
uint64_t Offset = 0;
if (Size >= ShadowVecSize) {
VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
Value *ShadowVec = UndefValue::get(ShadowVecTy);
for (unsigned i = 0; i != ShadowVecSize; ++i) {
ShadowVec = IRB.CreateInsertElement(
ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
}
Value *ShadowVecAddr =
IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
do {
Value *CurShadowVecAddr =
IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
Size -= ShadowVecSize;
++Offset;
} while (Size >= ShadowVecSize);
Offset *= ShadowVecSize;
}
while (Size > 0) {
Value *CurShadowAddr =
IRB.CreateConstGEP1_32(DFS.ShadowTy, ShadowAddr, Offset);
IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign);
--Size;
++Offset;
}
}
void DFSanVisitor::visitStoreInst(StoreInst &SI) {
auto &DL = SI.getModule()->getDataLayout();
uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
if (Size == 0)
return;
uint64_t Align;
if (ClPreserveAlignment) {
Align = SI.getAlignment();
if (Align == 0)
Align = DL.getABITypeAlignment(SI.getValueOperand()->getType());
} else {
Align = 1;
}
Value* Shadow = DFSF.getShadow(SI.getValueOperand());
if (ClCombinePointerLabelsOnStore) {
Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
}
DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI);
}
void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
visitOperandShadowInst(BO);
}
void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
visitOperandShadowInst(GEPI);
}
void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
visitOperandShadowInst(I);
}
void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
visitOperandShadowInst(I);
}
void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
visitOperandShadowInst(I);
}
void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
visitOperandShadowInst(I);
}
void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
visitOperandShadowInst(I);
}
void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
bool AllLoadsStores = true;
for (User *U : I.users()) {
if (isa<LoadInst>(U))
continue;
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (SI->getPointerOperand() == &I)
continue;
}
AllLoadsStores = false;
break;
}
if (AllLoadsStores) {
IRBuilder<> IRB(&I);
DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy);
}
DFSF.setShadow(&I, DFSF.DFS.ZeroShadow);
}
void DFSanVisitor::visitSelectInst(SelectInst &I) {
Value *CondShadow = DFSF.getShadow(I.getCondition());
Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
if (isa<VectorType>(I.getCondition()->getType())) {
DFSF.setShadow(
&I,
DFSF.combineShadows(
CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I));
} else {
Value *ShadowSel;
if (TrueShadow == FalseShadow) {
ShadowSel = TrueShadow;
} else {
ShadowSel =
SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
}
DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I));
}
}
void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
IRBuilder<> IRB(&I);
Value *ValShadow = DFSF.getShadow(I.getValue());
IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
{ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
*DFSF.DFS.Ctx)),
IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
}
void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
IRBuilder<> IRB(&I);
Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
Value *LenShadow = IRB.CreateMul(
I.getLength(),
ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
Value *AlignShadow;
if (ClPreserveAlignment) {
AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
ConstantInt::get(I.getAlignmentCst()->getType(),
DFSF.DFS.ShadowWidth / 8));
} else {
AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
DFSF.DFS.ShadowWidth / 8);
}
Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow,
AlignShadow, I.getVolatileCst()});
}
void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
if (!DFSF.IsNativeABI && RI.getReturnValue()) {
switch (DFSF.IA) {
case DataFlowSanitizer::IA_TLS: {
Value *S = DFSF.getShadow(RI.getReturnValue());
IRBuilder<> IRB(&RI);
IRB.CreateStore(S, DFSF.getRetvalTLS());
break;
}
case DataFlowSanitizer::IA_Args: {
IRBuilder<> IRB(&RI);
Type *RT = DFSF.F->getFunctionType()->getReturnType();
Value *InsVal =
IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
Value *InsShadow =
IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
RI.setOperand(0, InsShadow);
break;
}
}
}
}
void DFSanVisitor::visitCallSite(CallSite CS) {
Function *F = CS.getCalledFunction();
if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
visitOperandShadowInst(*CS.getInstruction());
return;
}
// Calls to this function are synthesized in wrappers, and we shouldn't
// instrument them.
if (F == DFSF.DFS.DFSanVarargWrapperFn)
return;
IRBuilder<> IRB(CS.getInstruction());
DenseMap<Value *, Function *>::iterator i =
DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
if (i != DFSF.DFS.UnwrappedFnMap.end()) {
Function *F = i->second;
switch (DFSF.DFS.getWrapperKind(F)) {
case DataFlowSanitizer::WK_Warning: {
CS.setCalledFunction(F);
IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
IRB.CreateGlobalStringPtr(F->getName()));
DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
return;
}
case DataFlowSanitizer::WK_Discard: {
CS.setCalledFunction(F);
DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
return;
}
case DataFlowSanitizer::WK_Functional: {
CS.setCalledFunction(F);
visitOperandShadowInst(*CS.getInstruction());
return;
}
case DataFlowSanitizer::WK_Custom: {
// Don't try to handle invokes of custom functions; it's too complicated.
// Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
// wrapper.
if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
FunctionType *FT = F->getFunctionType();
FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
std::string CustomFName = "__dfsw_";
CustomFName += F->getName();
Constant *CustomF =
DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
CustomFn->copyAttributesFrom(F);
// Custom functions returning non-void will write to the return label.
if (!FT->getReturnType()->isVoidTy()) {
CustomFn->removeAttributes(AttributeList::FunctionIndex,
DFSF.DFS.ReadOnlyNoneAttrs);
}
}
std::vector<Value *> Args;
CallSite::arg_iterator i = CS.arg_begin();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
Type *T = (*i)->getType();
FunctionType *ParamFT;
if (isa<PointerType>(T) &&
(ParamFT = dyn_cast<FunctionType>(
cast<PointerType>(T)->getElementType()))) {
std::string TName = "dfst";
TName += utostr(FT->getNumParams() - n);
TName += "$";
TName += F->getName();
Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
Args.push_back(T);
Args.push_back(
IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
} else {
Args.push_back(*i);
}
}
i = CS.arg_begin();
+ const unsigned ShadowArgStart = Args.size();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(DFSF.getShadow(*i));
if (FT->isVarArg()) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
CS.arg_size() - FT->getNumParams());
auto *LabelVAAlloca = new AllocaInst(
LabelVATy, getDataLayout().getAllocaAddrSpace(),
"labelva", &DFSF.F->getEntryBlock().front());
for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr);
}
Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
}
if (!FT->getReturnType()->isVoidTy()) {
if (!DFSF.LabelReturnAlloca) {
DFSF.LabelReturnAlloca =
new AllocaInst(DFSF.DFS.ShadowTy,
getDataLayout().getAllocaAddrSpace(),
"labelreturn", &DFSF.F->getEntryBlock().front());
}
Args.push_back(DFSF.LabelReturnAlloca);
}
for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i)
Args.push_back(*i);
CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
CustomCI->setCallingConv(CI->getCallingConv());
CustomCI->setAttributes(CI->getAttributes());
+ // Update the parameter attributes of the custom call instruction to
+ // zero extend the shadow parameters. This is required for targets
+ // which consider ShadowTy an illegal type.
+ for (unsigned n = 0; n < FT->getNumParams(); n++) {
+ const unsigned ArgNo = ShadowArgStart + n;
+ if (CustomCI->getArgOperand(ArgNo)->getType() == DFSF.DFS.ShadowTy)
+ CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
+ }
+
if (!FT->getReturnType()->isVoidTy()) {
LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
DFSF.setShadow(CustomCI, LabelLoad);
}
CI->replaceAllUsesWith(CustomCI);
CI->eraseFromParent();
return;
}
break;
}
}
}
FunctionType *FT = cast<FunctionType>(
CS.getCalledValue()->getType()->getPointerElementType());
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
DFSF.getArgTLS(i, CS.getInstruction()));
}
}
Instruction *Next = nullptr;
if (!CS.getType()->isVoidTy()) {
if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
if (II->getNormalDest()->getSinglePredecessor()) {
Next = &II->getNormalDest()->front();
} else {
BasicBlock *NewBB =
SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
Next = &NewBB->front();
}
} else {
assert(CS->getIterator() != CS->getParent()->end());
Next = CS->getNextNode();
}
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
IRBuilder<> NextIRB(Next);
LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
DFSF.SkipInsts.insert(LI);
DFSF.setShadow(CS.getInstruction(), LI);
DFSF.NonZeroChecks.push_back(LI);
}
}
// Do all instrumentation for IA_Args down here to defer tampering with the
// CFG in a way that SplitEdge may be able to detect.
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
Value *Func =
IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
std::vector<Value *> Args;
CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(*i);
i = CS.arg_begin();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(DFSF.getShadow(*i));
if (FT->isVarArg()) {
unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
AllocaInst *VarArgShadow =
new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
"", &DFSF.F->getEntryBlock().front());
Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
for (unsigned n = 0; i != e; ++i, ++n) {
IRB.CreateStore(
DFSF.getShadow(*i),
IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
Args.push_back(*i);
}
}
CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
Args);
} else {
NewCS = IRB.CreateCall(Func, Args);
}
NewCS.setCallingConv(CS.getCallingConv());
NewCS.setAttributes(CS.getAttributes().removeAttributes(
*DFSF.DFS.Ctx, AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
if (Next) {
ExtractValueInst *ExVal =
ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
DFSF.SkipInsts.insert(ExVal);
ExtractValueInst *ExShadow =
ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
DFSF.SkipInsts.insert(ExShadow);
DFSF.setShadow(ExVal, ExShadow);
DFSF.NonZeroChecks.push_back(ExShadow);
CS.getInstruction()->replaceAllUsesWith(ExVal);
}
CS.getInstruction()->eraseFromParent();
}
}
void DFSanVisitor::visitPHINode(PHINode &PN) {
PHINode *ShadowPN =
PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
// Give the shadow phi node valid predecessors to fool SplitEdge into working.
Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
++i) {
ShadowPN->addIncoming(UndefShadow, *i);
}
DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
DFSF.setShadow(&PN, ShadowPN);
}
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index 61e8700f1cd6..2e5618686ec2 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -1,118 +1,162 @@
//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Bit-Tracking Dead Code Elimination pass. Some
// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
// We track these dead bits and remove instructions that compute only these
// dead bits.
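//
// Illustrative example (not in the upstream header): given
//   %t = or i32 %a, %b
//   %r = shl i32 %t, 24
// only the low 8 bits of %t are demanded by %r, so any instruction that feeds
// nothing but the high 24 bits of %t is computing dead bits and can go away.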
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
#define DEBUG_TYPE "bdce"
STATISTIC(NumRemoved, "Number of instructions removed (unused)");
STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && !DB.getDemandedBits(J).isAllOnesValue())
+ WorkList.push_back(J);
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ SmallPtrSet<Instruction *, 16> Visited;
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ Visited.insert(J);
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
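+// Example for clearAssumptionsOfUsers above (illustrative, not from the
+// upstream change): if the trivialized value fed, perhaps indirectly, an
+// instruction such as
+//   %u = add nuw nsw i64 %a, %b
+// whose wrap flags were only justified by the original operand values, then
+// substituting zero could turn %u into a source of poison. Dropping
+// nuw/nsw/exact in the walk above keeps the replacement safe.
+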
static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
SmallVector<Instruction*, 128> Worklist;
bool Changed = false;
for (Instruction &I : instructions(F)) {
// If the instruction has side effects and no non-dbg uses,
// skip it. This way we avoid computing known bits on an instruction
// that will not help us.
if (I.mayHaveSideEffects() && I.use_empty())
continue;
if (I.getType()->isIntegerTy() &&
!DB.getDemandedBits(&I).getBoolValue()) {
// For live instructions that have all dead bits, first make them dead by
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
// FIXME: In theory we could substitute undef here instead of zero.
// This should be reconsidered once we settle on the semantics of
// undef, poison, etc.
Value *Zero = ConstantInt::get(I.getType(), 0);
++NumSimplified;
I.replaceNonMetadataUsesWith(Zero);
Changed = true;
}
if (!DB.isInstructionDead(&I))
continue;
Worklist.push_back(&I);
I.dropAllReferences();
Changed = true;
}
for (Instruction *&I : Worklist) {
++NumRemoved;
I->eraseFromParent();
}
return Changed;
}
PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
if (!bitTrackingDCE(F, DB))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
namespace {
struct BDCELegacyPass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
BDCELegacyPass() : FunctionPass(ID) {
initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
return bitTrackingDCE(F, DB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char BDCELegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
"Bit-Tracking Dead Code Elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
"Bit-Tracking Dead Code Elimination", false, false)
FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
diff --git a/test/Analysis/ScalarEvolution/max-addrec-size.ll b/test/Analysis/ScalarEvolution/max-addrec-size.ll
new file mode 100644
index 000000000000..aad0ddda37bc
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/max-addrec-size.ll
@@ -0,0 +1,33 @@
+; RUN: opt -analyze -scalar-evolution -scalar-evolution-max-add-rec-size=3 < %s | FileCheck %s
+
+; Show that we are able to avoid creation of huge SCEVs by capping the max
+; AddRec size.
+define i32 @test_01(i32 %a, i32 %b) {
+
+; CHECK-LABEL: Classifying expressions for: @test_01
+; CHECK-NEXT: %iv = phi i32 [ %a, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {%a,+,%b}<%loop> U: full-set S: full-set
+; CHECK-NEXT: %iv.next = add i32 %iv, %b
+; CHECK-NEXT: --> {(%a + %b),+,%b}<%loop> U: full-set S: full-set
+; CHECK-NEXT: %x1 = mul i32 %iv, %iv.next
+; CHECK-NEXT: --> {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> U: full-set S: full-set
+; CHECK-NEXT: %x2 = mul i32 %x1, %x1
+; CHECK-NEXT: --> ({((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop>) U: full-set S: full-set
+; CHECK-NEXT: %x3 = mul i32 %x2, %x1
+; CHECK-NEXT: --> ({((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop> * {((%a + %b) * %a),+,(((2 * %a) + (2 * %b)) * %b),+,(2 * %b * %b)}<%loop>) U: full-set S: full-set
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ %a, %entry ], [ %iv.next, %loop ]
+ %iv.next = add i32 %iv, %b
+ %cond = icmp slt i32 %iv.next, 1000
+ br i1 %cond, label %loop, label %exit
+
+exit:
+ %x1 = mul i32 %iv, %iv.next
+ %x2 = mul i32 %x1, %x1
+ %x3 = mul i32 %x2, %x1
+ ret i32 %x3
+}
diff --git a/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir b/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir
new file mode 100644
index 000000000000..dacaf4966d07
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldst-unscaled-pre-post.mir
@@ -0,0 +1,115 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s
+---
+# CHECK-LABEL: name: test_LDURSi_post
+# CHECK: LDRSpost %x0, -4
+name: test_LDURSi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %s0 = LDURSi %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURDi_post
+# CHECK: LDRDpost %x0, -4
+name: test_LDURDi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %d0 = LDURDi %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURQi_post
+# CHECK: LDRQpost %x0, -4
+name: test_LDURQi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %q0 = LDURQi %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURWi_post
+# CHECK: LDRWpost %x0, -4
+name: test_LDURWi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %w1 = LDURWi %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_LDURXi_post
+# CHECK: %x1 = LDRXpost %x0, -4
+name: test_LDURXi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %x1 = LDURXi %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURSi_post
+# CHECK: STRSpost %s0, %x0, -4
+name: test_STURSi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %s0 = FMOVS0
+ STURSi %s0, %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURDi_post
+# CHECK: STRDpost %d0, %x0, -4
+name: test_STURDi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %d0 = FMOVD0
+ STURDi %d0, %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURQi_post
+# CHECK: STRQpost %q0, %x0, -4
+name: test_STURQi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ %q0 = MOVIv4i32 0, 0
+ STURQi %q0, %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURWi_post
+# CHECK: STRWpost %wzr, %x0, -4
+name: test_STURWi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ STURWi %wzr, %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
+# CHECK-LABEL: name: test_STURXi_post
+# CHECK: STRXpost %xzr, %x0, -4
+name: test_STURXi_post
+body: |
+ bb.0.entry:
+ liveins: %x0
+
+ STURXi %xzr, %x0, 0
+ %x0 = SUBXri %x0, 4, 0
+ RET_ReallyLR implicit %x0
+...
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll
index a3be72112c76..f8ad2bbbbe0e 100644
--- a/test/CodeGen/ARM/cmpxchg-O0.ll
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@@ -1,116 +1,113 @@
; RUN: llc -verify-machineinstrs -mtriple=armv7-linux-gnu -O0 %s -o - | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=thumbv8-linux-gnu -O0 %s -o - | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=thumbv6m-none-eabi -O0 %s -o - | FileCheck %s --check-prefix=CHECK-T1
; CHECK-T1-NOT: ldrex
; CHECK-T1-NOT: strex
define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
; CHECK-LABEL: test_cmpxchg_8:
; CHECK: dmb ish
; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]]
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrexb [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strexb [[STATUS]], r2, [r0]
+; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
; CHECK: dmb ish
%res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
ret { i8, i1 } %res
}
define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind {
; CHECK-LABEL: test_cmpxchg_16:
; CHECK: dmb ish
; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]]
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrexh [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strexh [[STATUS]], r2, [r0]
+; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
; CHECK: dmb ish
%res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
ret { i16, i1 } %res
}
define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind {
; CHECK-LABEL: test_cmpxchg_32:
; CHECK: dmb ish
; CHECK-NOT: uxt
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
-; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0
; CHECK: ldrex [[OLD:r[0-9]+]], [r0]
; CHECK: cmp [[OLD]], [[DESIRED]]
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK: strex [[STATUS]], r2, [r0]
+; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
; CHECK: dmb ish
%res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
ret { i32, i1 } %res
}
define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind {
; CHECK-LABEL: test_cmpxchg_64:
; CHECK: dmb ish
; CHECK-NOT: uxt
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
; CHECK: cmp [[OLDLO]], r6
; CHECK: cmpeq [[OLDHI]], r7
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
; CHECK: strexd [[STATUS:r[0-9]+]], r4, r5, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
; CHECK: dmb ish
%res = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst monotonic
ret { i64, i1 } %res
}
define { i64, i1 } @test_nontrivial_args(i64* %addr, i64 %desired, i64 %new) {
; CHECK-LABEL: test_nontrivial_args:
; CHECK: dmb ish
; CHECK-NOT: uxt
; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
; CHECK: ldrexd [[OLDLO:r[0-9]+]], [[OLDHI:r[0-9]+]], [r0]
; CHECK: cmp [[OLDLO]], {{r[0-9]+}}
; CHECK: cmpeq [[OLDHI]], {{r[0-9]+}}
; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]]
; CHECK: strexd [[STATUS:r[0-9]+]], {{r[0-9]+}}, {{r[0-9]+}}, [r0]
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
; CHECK: dmb ish
%desired1 = add i64 %desired, 1
%new1 = add i64 %new, 1
%res = cmpxchg i64* %addr, i64 %desired1, i64 %new1 seq_cst seq_cst
ret { i64, i1 } %res
}
; The following used to trigger an assertion when creating a spill on thumb2
; for a physreg with RC==GPRPairRegClass.
; CHECK-LABEL: test_cmpxchg_spillbug:
; CHECK: ldrexd
; CHECK: strexd
; CHECK: bne
define void @test_cmpxchg_spillbug() {
%v = cmpxchg i64* undef, i64 undef, i64 undef seq_cst seq_cst
ret void
}
diff --git a/test/CodeGen/ARM/virtregrewriter-subregliveness.mir b/test/CodeGen/ARM/virtregrewriter-subregliveness.mir
new file mode 100644
index 000000000000..83335a3ccffd
--- /dev/null
+++ b/test/CodeGen/ARM/virtregrewriter-subregliveness.mir
@@ -0,0 +1,84 @@
+# RUN: llc -o - -mtriple=thumbv7--windows-gnu -run-pass=greedy -run-pass=virtregrewriter %s | FileCheck %s
+--- |
+ target datalayout = "e-m:w-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv7--windows-gnu"
+
+ define void @subregLiveThrough() { ret void }
+ define void @subregNotLiveThrough() { ret void }
+ define void @subregNotLiveThrough2() { ret void }
+
+...
+---
+# Check that we properly recognize that r1 is live through
+# the first subreg copy.
+# That will materialize as an implicit use of the big register
+# on that copy.
+# PR34107.
+#
+# CHECK-LABEL: name: subregLiveThrough
+name: subregLiveThrough
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprpair }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; That copy is being coalesced so we should use a KILL
+ ; placeholder. If that's not a kill that means we are probably
+ ; not coalescing %0 and %r0_r1 and thus we are not testing
+ ; the problematic code anymore.
+ ;
+ ; CHECK: %r0 = KILL %r0, implicit killed %r0_r1, implicit-def %r0_r1
+ ; CHECK-NEXT: %r1 = KILL %r1, implicit killed %r0_r1
+ undef %0.gsub_0 = COPY %r0
+ %0.gsub_1 = COPY %r1
+ tBX_RET 14, _, implicit %0
+
+
+...
+
+---
+# Check that we properly recognize that r1 is *not* live through
+# the first subreg copy.
+# CHECK-LABEL: name: subregNotLiveThrough
+name: subregNotLiveThrough
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprpair }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; r1 is not live through so check we are not implicitly using
+ ; the big register.
+ ; CHECK: %r0 = KILL %r0, implicit-def %r0_r1
+ ; CHECK-NEXT: tBX_RET
+ undef %0.gsub_0 = COPY %r0
+ tBX_RET 14, _, implicit %0
+
+
+...
+
+---
+# Check that we properly recognize that r1 is *not* live through
+# the first subreg copy. It is defined by this copy, but is not
+# live through.
+# CHECK-LABEL: name: subregNotLiveThrough2
+name: subregNotLiveThrough2
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprpair }
+body: |
+ bb.0:
+ liveins: %r0, %r1
+
+ ; r1 is not live through so check we are not implicitly using
+ ; the big register.
+ ; CHECK: %r0 = KILL %r0, implicit-def %r1, implicit-def %r0_r1
+ ; CHECK-NEXT: tBX_RET
+ undef %0.gsub_0 = COPY %r0, implicit-def %r1
+ tBX_RET 14, _, implicit %0
+
+
+...
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
index 0498177a9c12..819a5df14e63 100644
--- a/test/CodeGen/X86/adx-intrinsics.ll
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -1,77 +1,104 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
; CHECK-LABEL: test_addcarryx_u32
; CHECK: addb
; ADX: adcxl
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
; CHECK-LABEL: test_addcarryx_u64
; CHECK: addb
; ADX: adcxq
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
; CHECK-LABEL: test_addcarry_u32
; CHECK: addb
; ADX: adcxl
; NOADX: adcl
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
; CHECK-LABEL: test_addcarry_u64
; CHECK: addb
; ADX: adcxq
; NOADX: adcq
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
; CHECK-LABEL: test_subborrow_u32
; CHECK: addb
; CHECK: sbbl
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
; CHECK-LABEL: test_subborrow_u64
; CHECK: addb
; CHECK: sbbq
; CHECK: setb
; CHECK: retq
%ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
+; Try a version with loads. Previously we crashed on this.
+define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i64* %res) {
+; CHECK-LABEL: load_crash
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+ %1 = load i64, i64* %a, align 8
+ %2 = load i64, i64* %b, align 8
+ %3 = bitcast i64* %res to i8*
+ %4 = tail call i8 @llvm.x86.addcarryx.u64(i8 0, i64 %1, i64 %2, i8* %3)
+ %conv = zext i8 %4 to i32
+ ret i32 %conv
+}
+
+; Try a really simple all zero input case, which also used to crash
+define void @allzeros() {
+; CHECK-LABEL: allzeros
+; CHECK: xorl
+; CHECK: addb
+; CHECK: sbbq
+; CHECK: andl
+; CHECK: retq
+entry:
+ %0 = tail call i8 @llvm.x86.addcarryx.u64(i8 0, i64 0, i64 0, i8* null)
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 5472f057ef27..4abe3df9fc2a 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1,2292 +1,2292 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
ret <64 x i8> %1
}
define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
ret <64 x i8> %1
}
define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
ret <64 x i8> %3
}
declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <16 x i32>, <16 x i32>* %ptr_b
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
ret <32 x i16> %1
}
define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
ret <32 x i16> %3
}
define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
%b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
ret <32 x i16> %3
}
declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
ret <64 x i8> %1
}
define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
ret <64 x i8> %1
}
define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
ret <64 x i8> %3
}
define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
%2 = bitcast i64 %mask to <64 x i1>
%3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
ret <64 x i8> %3
}
declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
ret <64 x i8> %res
}
define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
%mask.cast = bitcast i64 %mask to <64 x i1>
%res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> %x2
ret <64 x i8> %res2
}
define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
%mask.cast = bitcast i64 %mask to <64 x i1>
%res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> zeroinitializer
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpabsw %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpabsw %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpabsw %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpabsw %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpabsb %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpabsb %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpabsb %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpabsb %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z}
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z}
; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-32-NEXT: retl
%res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
%res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
%res3 = add <32 x i8> %res0, %res1
%res4 = add <32 x i8> %res3, %res2
ret <32 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovwb %zmm0, (%eax)
; AVX512F-32-NEXT: vpmovwb %zmm0, (%eax) {%k1}
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
ret void
}
declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovswb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z}
; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z}
; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-32-NEXT: retl
%res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
%res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
%res3 = add <32 x i8> %res0, %res1
%res4 = add <32 x i8> %res3, %res2
ret <32 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovswb %zmm0, (%eax)
; AVX512F-32-NEXT: vpmovswb %zmm0, (%eax) {%k1}
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
ret void
}
declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z}
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z}
; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-32-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-32-NEXT: retl
%res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
%res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
%res3 = add <32 x i8> %res0, %res1
%res4 = add <32 x i8> %res3, %res2
ret <32 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovuswb %zmm0, (%eax)
; AVX512F-32-NEXT: vpmovuswb %zmm0, (%eax) {%k1}
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
ret void
}
declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
%res2 = add <16 x i32> %res, %res1
ret <16 x i32> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsadbw %zmm1, %zmm0, %zmm1
; AVX512F-32-NEXT: vpsadbw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
%res1 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
%res2 = add <8 x i64> %res, %res1
ret <8 x i64> %res2
}
declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
ret i32 %res
}
declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Lcfi0:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
}
declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtb2mask_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
; AVX512F-32-NEXT: .Lcfi1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
ret i64 %res
}
declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>)
define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtw2mask_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtw2mask_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovw2m %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16> %x0)
ret i32 %res
}
declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrlv32hi:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrlv32hi:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>,
<32 x i16> <i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49>,
<32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psllv32hi:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psllv32hi:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rcx
; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $20, %esp
; AVX512F-32-NEXT: .Lcfi2:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
%res2 = add i64 %res, %res1
ret i64 %res2
}
declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %ecx
; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %ecx
; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
%res2 = add i32 %res, %res1
ret i32 %res2
}
declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rcx
; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $20, %esp
; AVX512F-32-NEXT: .Lcfi3:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
%res2 = add i64 %res, %res1
ret i64 %res2
}
declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %ecx
; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %ecx
; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
%res2 = add i32 %res, %res1
ret i32 %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rsi, %k1
-; AVX512BW-NEXT: vpbroadcastb %dil, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastb %dil, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastb %dil, %zmm2
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpbroadcastb %al, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastb %al, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastb %al, %zmm2
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
%res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
%res3 = add <64 x i8> %res, %res1
%res4 = add <64 x i8> %res2, %res3
ret <64 x i8> %res4
}
declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpbroadcastw %di, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastw %di, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastw %di, %zmm2
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm2
+; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res2, %res3
ret <32 x i16> %res4
}
define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psll_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psll_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psll_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psll_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psll_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psll_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_pslli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_pslli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_pslli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_pslli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_pslli_w_512(<32 x i16> %a0, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_pslli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_pslli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psra_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psra_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psra_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psra_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psra_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psra_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_psrai_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrai_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrai_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrai_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrai_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrai_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psrl_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrl_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrl_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrl_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrl_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrl_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_psrli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
ret <32 x i16> %res
}
define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
ret <32 x i16> %res2
}
define <32 x i16> @test_x86_avx512_maskz_psrli_w_512(<32 x i16> %a0, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrli_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrli_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
%mask.cast = bitcast i32 %mask to <32 x i1>
%res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
ret <32 x i16> %res2
}
declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index c3ba6f106e6a..9ceb3e5931a6 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1,2874 +1,2874 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
ret <16 x i8> %1
}
define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
ret <16 x i8> %1
}
define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
ret <16 x i8> %3
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
ret <32 x i8> %1
}
define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
ret <32 x i8> %1
}
define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
ret <32 x i8> %3
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
ret <8 x i16> %1
}
define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
ret <8 x i16> %3
}
define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
%2 = bitcast i8 %mask to <8 x i1>
%3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
ret <8 x i16> %3
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
ret <16 x i16> %1
}
define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
ret <16 x i16> %3
}
define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
ret <16 x i8> %1
}
define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
ret <16 x i8> %1
}
define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
ret <16 x i8> %3
}
define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
%2 = bitcast i16 %mask to <16 x i1>
%3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
ret <16 x i8> %3
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
ret <32 x i8> %1
}
define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
ret <32 x i8> %1
}
define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
ret <32 x i8> %3
}
define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
%2 = bitcast i32 %mask to <32 x i1>
%3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
ret <32 x i8> %3
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epi16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epi16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epi16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epi16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epu16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epu16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
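; Unsigned saturating word subtract (vpsubusw). As in the tests above, the
; suffixes encode the operand/mask form: rr/rm = register or memory source,
; *k = merge-masking into the passthru argument ({%k1}), *kz = zero-masking
; ({%k1} {z} with a zeroinitializer passthru); an all-ones mask (-1) exercises
; the unmasked path.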
define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epu16_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu16_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epu16_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu16_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
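; Signed saturating byte add (vpaddsb), 128- and 256-bit forms.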
define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epi8_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi8_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epi8_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi8_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
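; Signed saturating byte subtract (vpsubsb), 128- and 256-bit forms.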
define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epi8_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi8_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epi8_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi8_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
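; Unsigned saturating byte add (vpaddusb), 128- and 256-bit forms.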
define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epu8_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu8_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epu8_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu8_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
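; Unsigned saturating byte subtract (vpsubusb), 128- and 256-bit forms.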
define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epu8_rr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu8_rm_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epu8_rr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu8_rm_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
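; Two-source word permutes. vpermt2w and vpermi2w both select words from two
; data vectors via a vector of indices; the mask.* intrinsics merge into the
; existing destination under %k1, while the maskz.* variants zero the masked
; lanes. Each test also runs the unmasked (-1) form and sums the two results.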
declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca]
; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca]
; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xda]
; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca]
; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xda]
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca]
; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
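; Packed rounding averages (vpavgb/vpavgw), masked and unmasked forms.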
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
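; Packed absolute value (vpabsb/vpabsw), masked and unmasked forms.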
declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
; CHECK-NEXT: vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpabsb %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1c,0xc8]
; CHECK-NEXT: vpabsb %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
; CHECK-NEXT: vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
; CHECK-NEXT: vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
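; High-half word multiplies: vpmulhuw (unsigned), vpmulhw (signed), and
; vpmulhrsw (signed with rounding and scaling), masked and unmasked forms.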
declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
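; Word-to-byte down-conversions: vpmovwb (truncate), vpmovswb (signed
; saturate), and vpmovuswb (unsigned saturate). The register forms check the
; unmasked, merge-masked, and zero-masked results; the *.mem.* variants store
; the narrowed vector directly to memory, both unmasked and masked.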
declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
; CHECK-NEXT: vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
; CHECK-NEXT: vpmovwb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovwb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
; CHECK-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
}
declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
; CHECK-NEXT: vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
; CHECK-NEXT: vpmovswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
; CHECK-NEXT: vpmovswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
}
declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
; CHECK-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
}
declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
; CHECK-NEXT: vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
; CHECK-NEXT: vpmovwb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovwb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
; CHECK-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
}
declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
; CHECK-NEXT: vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
; CHECK-NEXT: vpmovswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
; CHECK-NEXT: vpmovswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
}
declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
; CHECK-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
%res3 = add <16 x i8> %res0, %res1
%res4 = add <16 x i8> %res3, %res2
ret <16 x i8> %res4
}
declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
}
declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1]
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
ret <4 x i32> %res2
}
declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd9,0x02]
; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc1,0x02]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res2, %res3
ret <8 x i16> %res4
}
declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd9,0x02]
; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc1,0x02]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>)
define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
ret i16 %res
}
declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>)
define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovb2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0)
ret i32 %res
}
declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>)
define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
ret i8 %res
}
declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>)
define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
ret i16 %res
}
declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x10,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x10,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x10,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x11,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x11,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x11,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x12,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
; CHECK: ## BB#0:
; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x12,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x12,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8]
; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
%res2 = add i16 %res, %res1
ret i16 %res2
}
declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32)
define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
%res2 = add i32 %res, %res1
ret i32 %res2
}
declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
%res2 = add i8 %res, %res1
ret i8 %res2
}
declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
%res2 = add i16 %res, %res1
ret i16 %res2
}
declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
%res2 = add i16 %res, %res1
ret i16 %res2
}
declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32)
define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
%res2 = add i32 %res, %res1
ret i32 %res2
}
declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2)
define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
%res2 = add i8 %res, %res1
ret i8 %res2
}
declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2)
define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
%res2 = add i16 %res, %res1
ret i16 %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %dil, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %dil, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %dil, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
%res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
%res3 = add <32 x i8> %res, %res1
%res4 = add <32 x i8> %res2, %res3
ret <32 x i8> %res4
}
declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %dil, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %dil, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
%res3 = add <16 x i8> %res, %res1
%res4 = add <16 x i8> %res2, %res3
ret <16 x i8> %res4
}
declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %di, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %di, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res2, %res3
ret <16 x i16> %res4
}
declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %di, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %di, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res2, %res3
ret <8 x i16> %res4
}
diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll
new file mode 100644
index 000000000000..db866db22481
--- /dev/null
+++ b/test/CodeGen/X86/pr33349.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=+avx512f | FileCheck %s --check-prefix=KNL
+; RUN: llc < %s -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=SKX
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ define void @test(<4 x i1> %m, <4 x x86_fp80> %v, <4 x x86_fp80>*%p) local_unnamed_addr {
+; KNL-LABEL: test:
+; KNL: # BB#0: # %bb
+; KNL-NEXT: vpextrb $0, %xmm0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld1
+; KNL-NEXT: fldz
+; KNL-NEXT: fld %st(0)
+; KNL-NEXT: fcmovne %st(2), %st(0)
+; KNL-NEXT: vpextrb $4, %xmm0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(1)
+; KNL-NEXT: fcmovne %st(3), %st(0)
+; KNL-NEXT: vpextrb $8, %xmm0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(2)
+; KNL-NEXT: fcmovne %st(4), %st(0)
+; KNL-NEXT: vpextrb $12, %xmm0, %eax
+; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fcmovne %st(4), %st(0)
+; KNL-NEXT: fstp %st(4)
+; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
+; KNL-NEXT: fstpt 20(%rdi)
+; KNL-NEXT: fxch %st(1)
+; KNL-NEXT: fstpt 10(%rdi)
+; KNL-NEXT: fstpt (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test:
+; SKX: # BB#0: # %bb
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k2
+; SKX-NEXT: kshiftrw $15, %k2, %k2
+; SKX-NEXT: kshiftlw $15, %k2, %k2
+; SKX-NEXT: kshiftrw $15, %k2, %k2
+; SKX-NEXT: kmovd %k2, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld1
+; SKX-NEXT: fldz
+; SKX-NEXT: fld %st(0)
+; SKX-NEXT: fcmovne %st(2), %st(0)
+; SKX-NEXT: kshiftlw $14, %k1, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kmovd %k1, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(1)
+; SKX-NEXT: fcmovne %st(3), %st(0)
+; SKX-NEXT: kshiftlw $15, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kmovd %k1, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(2)
+; SKX-NEXT: fcmovne %st(4), %st(0)
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftlw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fcmovne %st(4), %st(0)
+; SKX-NEXT: fstp %st(4)
+; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fstpt 10(%rdi)
+; SKX-NEXT: fxch %st(1)
+; SKX-NEXT: fstpt (%rdi)
+; SKX-NEXT: fxch %st(1)
+; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fstpt 20(%rdi)
+; SKX-NEXT: retq
+ bb:
+ %tmp = select <4 x i1> %m, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
+ store <4 x x86_fp80> %tmp, <4 x x86_fp80>* %p, align 16
+ ret void
+ }
+
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
new file mode 100644
index 000000000000..d3667e3884d4
--- /dev/null
+++ b/test/CodeGen/X86/pr34088.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=pentium4 | FileCheck %s
+
+%struct.Foo = type { i32, %struct.Bar }
+%struct.Bar = type { i32, %struct.Buffer, i32 }
+%struct.Buffer = type { i8*, i32 }
+
+; This test checks that the load %2 (whose value is stored back after the second memset) is not dropped.
+;
+define i32 @pr34088() local_unnamed_addr {
+; CHECK-LABEL: pr34088:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: .Lcfi0:
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .Lcfi1:
+; CHECK-NEXT: .cfi_offset %ebp, -8
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: .Lcfi2:
+; CHECK-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $32, %esp
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205]
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movaps %xmm0, (%esp)
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movaps %xmm1, (%esp)
+; CHECK-NEXT: movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD
+; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+entry:
+ %foo = alloca %struct.Foo, align 4
+ %0 = bitcast %struct.Foo* %foo to i8*
+ call void @llvm.memset.p0i8.i32(i8* nonnull %0, i8 0, i32 20, i32 4, i1 false)
+ %buffer1 = getelementptr inbounds %struct.Foo, %struct.Foo* %foo, i32 0, i32 1, i32 1
+ %1 = bitcast %struct.Buffer* %buffer1 to i64*
+ %2 = load i64, i64* %1, align 4
+ call void @llvm.memset.p0i8.i32(i8* nonnull %0, i8 -51, i32 20, i32 4, i1 false)
+ store i64 %2, i64* %1, align 4
+ ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1)
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
new file mode 100644
index 000000000000..9e6382faaa59
--- /dev/null
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+mmx < %s | FileCheck %s --check-prefix=I32
+
+
+; From source: clang -O2
+;__m64 test47(int a)
+;{
+; __m64 x = (a)? (__m64)(7): (__m64)(0);
+; return __builtin_ia32_psllw(x, x);
+;}
+
+define i64 @test47(i64 %arg) {
+;
+; X64-LABEL: test47:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: movl $7, %ecx
+; X64-NEXT: cmoveq %rcx, %rax
+; X64-NEXT: movd %rax, %mm0
+; X64-NEXT: psllw %mm0, %mm0
+; X64-NEXT: movd %mm0, %rax
+; X64-NEXT: retq
+;
+; I32-LABEL: test47:
+; I32: # BB#0:
+; I32-NEXT: pushl %ebp
+; I32-NEXT: .Lcfi0:
+; I32-NEXT: .cfi_def_cfa_offset 8
+; I32-NEXT: .Lcfi1:
+; I32-NEXT: .cfi_offset %ebp, -8
+; I32-NEXT: movl %esp, %ebp
+; I32-NEXT: .Lcfi2:
+; I32-NEXT: .cfi_def_cfa_register %ebp
+; I32-NEXT: andl $-8, %esp
+; I32-NEXT: subl $16, %esp
+; I32-NEXT: movl 8(%ebp), %eax
+; I32-NEXT: orl 12(%ebp), %eax
+; I32-NEXT: movl $7, %eax
+; I32-NEXT: je .LBB0_2
+; I32-NEXT: # BB#1:
+; I32-NEXT: xorl %eax, %eax
+; I32-NEXT: .LBB0_2:
+; I32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I32-NEXT: movl $0, {{[0-9]+}}(%esp)
+; I32-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I32-NEXT: psllw %mm0, %mm0
+; I32-NEXT: movq %mm0, (%esp)
+; I32-NEXT: movl (%esp), %eax
+; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I32-NEXT: movl %ebp, %esp
+; I32-NEXT: popl %ebp
+; I32-NEXT: retl
+ %cond = icmp eq i64 %arg, 0
+ %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
+ %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
+ %retc = bitcast x86_mmx %psll to i64
+ ret i64 %retc
+}
+
+
+; From source: clang -O2
+;__m64 test49(int a, long long n, long long m)
+;{
+; __m64 x = (a)? (__m64)(n): (__m64)(m);
+; return __builtin_ia32_psllw(x, x);
+;}
+
+define i64 @test49(i64 %arg, i64 %x, i64 %y) {
+;
+; X64-LABEL: test49:
+; X64: # BB#0:
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: cmovneq %rdx, %rsi
+; X64-NEXT: movd %rsi, %mm0
+; X64-NEXT: psllw %mm0, %mm0
+; X64-NEXT: movd %mm0, %rax
+; X64-NEXT: retq
+;
+; I32-LABEL: test49:
+; I32: # BB#0:
+; I32-NEXT: pushl %ebp
+; I32-NEXT: .Lcfi3:
+; I32-NEXT: .cfi_def_cfa_offset 8
+; I32-NEXT: .Lcfi4:
+; I32-NEXT: .cfi_offset %ebp, -8
+; I32-NEXT: movl %esp, %ebp
+; I32-NEXT: .Lcfi5:
+; I32-NEXT: .cfi_def_cfa_register %ebp
+; I32-NEXT: andl $-8, %esp
+; I32-NEXT: subl $8, %esp
+; I32-NEXT: movl 8(%ebp), %eax
+; I32-NEXT: orl 12(%ebp), %eax
+; I32-NEXT: je .LBB1_1
+; I32-NEXT: # BB#2:
+; I32-NEXT: leal 24(%ebp), %eax
+; I32-NEXT: jmp .LBB1_3
+; I32-NEXT: .LBB1_1:
+; I32-NEXT: leal 16(%ebp), %eax
+; I32-NEXT: .LBB1_3:
+; I32-NEXT: movq (%eax), %mm0
+; I32-NEXT: psllw %mm0, %mm0
+; I32-NEXT: movq %mm0, (%esp)
+; I32-NEXT: movl (%esp), %eax
+; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I32-NEXT: movl %ebp, %esp
+; I32-NEXT: popl %ebp
+; I32-NEXT: retl
+ %cond = icmp eq i64 %arg, 0
+ %xmmx = bitcast i64 %x to x86_mmx
+ %ymmx = bitcast i64 %y to x86_mmx
+ %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx
+ %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
+ %retc = bitcast x86_mmx %psll to i64
+ ret i64 %retc
+}
+
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx)
+
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index abba0ff87ace..9f1ed021992d 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1,1830 +1,1830 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; SSE: # BB#0:
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; SSE41: # BB#0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; SSE41: # BB#0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE2: # BB#0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE2: # BB#0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; SSE: # BB#0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE2: # BB#0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE2: # BB#0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: orps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
ret <16 x i8> %shuffle
}
define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; SSE2-LABEL: trunc_v4i32_shuffle:
; SSE2: # BB#0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_v4i32_shuffle:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_v4i32_shuffle:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4i32_shuffle:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %shuffle
}
define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
; We don't have anything useful to check here. This generates hundreds of
; instructions. Instead, just make sure we survive codegen.
; ALL-LABEL: stress_test0:
; ALL: retq
entry:
%s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
%s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
%s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
%s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
%s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
%s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
%s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
%s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
%s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
%s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
ret <16 x i8> %s.16.0
}
define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
; There is nothing interesting to check about these instructions other than
; that they survive codegen. However, we actually do better and delete all of
; them because the result is 'undef'.
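; (Presumably every output lane of %s.12.4 traces back to an undef input
; somewhere in the shuffle chain, which is what lets the backend fold the
; whole computation away.)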
;
; ALL-LABEL: undef_test1:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
%s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
%s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
%s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
%s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
%s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
%s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
%s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
%s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
%s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
%s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %s.12.4
}
define <16 x i8> @PR20540(<8 x i8> %a) {
; SSE2-LABEL: PR20540:
; SSE2: # BB#0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR20540:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR20540:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: PR20540:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: movzbl %dil, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE2: # BB#0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSSE3: # BB#0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 0
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: movzbl %dil, %eax
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <16 x i8> undef, i8 %i, i32 3
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
; SSE: # BB#0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
; AVX: # BB#0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <16 x i8> %shuffle
}
; PR31151
define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
; SSE2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; SSE41: # BB#0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm2, %xmm5
; SSE2-NEXT: psrlq $16, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
ret <16 x i8> %shuffle
}
define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
; Nothing interesting to test here. Just make sure we didn't crash.
; ALL-LABEL: stress_test2:
; ALL: retq
entry:
%s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
%s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
%s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
ret <16 x i8> %s.2.0
}
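; A shuffle whose operands are both known zero should fold away entirely,
; leaving a single zeroed register stored through both pointers.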
define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
; SSE-LABEL: constant_gets_selected:
; SSE: # BB#0: # %entry
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: constant_gets_selected:
; AVX1OR2: # BB#0: # %entry
; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovaps %xmm0, (%rdi)
; AVX1OR2-NEXT: vmovaps %xmm0, (%rsi)
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: constant_gets_selected:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: retq
entry:
%weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
%shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
%weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
ret void
}
;
; Shuffle to logical bit shifts
;
define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
; SSE: # BB#0:
; SSE-NEXT: psllw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
; AVX: # BB#0:
; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
; SSE: # BB#0:
; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
; AVX: # BB#0:
; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
; SSE: # BB#0:
; SSE-NEXT: psllq $56, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $56, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
; SSE: # BB#0:
; SSE-NEXT: psllq $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
; SSE: # BB#0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
; SSE: # BB#0:
; SSE-NEXT: psrlq $56, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
ret <16 x i8> %shuffle
}
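; PR12412: picking the even-indexed bytes of both inputs lowers to
; pand+packuswb on SSE2 and to pshufb+punpcklqdq from SSSE3 onward.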
define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSE2-LABEL: PR12412:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR12412:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR12412:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: PR12412:
; AVX1OR2: # BB#0: # %entry
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: PR12412:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
entry:
%0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
ret <16 x i8> %0
}
define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
; SSE: # BB#0:
; SSE-NEXT: psrld $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrld $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
ret <16 x i8> %shuffle
}
define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
; SSE: # BB#0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
%bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
%shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
%shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8>
ret <16 x i8> %bitcast8
}
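; Byte broadcasts of loaded scalars: AVX2/AVX-512 use vpbroadcastb,
; SSSE3/SSE4.1 use a pshufb splat mask, and SSE2 falls back to
; punpcklbw+pshuflw+pshufd.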
define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_mem_v16i8_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v16i8_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v16i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp3
}
define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSE2: # BB#0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSE41: # BB#0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp4
}
define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %tmp3
}
define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <16 x i8> %tmp3
}
define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE2: # BB#0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; SSE41: # BB#0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX2: # BB#0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $8, %eax
-; AVX512VL-NEXT: vpbroadcastb %al, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i8> %tmp4
}
define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE2: # BB#0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; SSE41: # BB#0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX2: # BB#0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: vpbroadcastb %al, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <16 x i8> %tmp4
}
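; PR31364: shuffle of a vector built from two single-byte loads; SSE4.1 and
; AVX lower it with pinsrb+pshufb, while SSE2/SSSE3 assemble the two bytes in
; a scalar register first.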
define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) {
; SSE2-LABEL: PR31364:
; SSE2: # BB#0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movzwl %cx, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31364:
; SSSE3: # BB#0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movzbl (%rsi), %ecx
; SSSE3-NEXT: shll $8, %ecx
; SSSE3-NEXT: orl %eax, %ecx
; SSSE3-NEXT: movzwl %cx, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31364:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: PR31364:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
; AVX-NEXT: retq
%v0 = load i8, i8* %a, align 1
%vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0
%v1 = load i8, i8* %b, align 1
%vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1
%result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
ret <16 x i8> %result
}
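; PR31301: broadcast one byte from each pointer and interleave the results;
; AVX2/AVX-512 use two vpbroadcastb loads followed by vpunpcklbw.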
define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; SSE2-LABEL: PR31301:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31301:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movzbl (%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31301:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: movzbl (%rsi), %eax
; SSE41-NEXT: movd %eax, %xmm2
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR31301:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsi), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR31301:
; AVX2OR512VL: # BB#0: # %entry
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: retq
entry:
%0 = load i8, i8* %x, align 1
%1 = insertelement <16 x i8> undef, i8 %0, i32 0
%lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = load i8, i8* %y, align 1
%3 = insertelement <16 x i8> undef, i8 %2, i32 0
%lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%vzip.i = shufflevector <16 x i8> %lane, <16 x i8> %lane3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %vzip.i
}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index c03b9d1472c1..1cf8453fc6ad 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1,2454 +1,2454 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_01012323:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_01012323:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_67452301:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_67452301:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_456789AB:
; SSE2: # BB#0:
; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_456789AB:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_456789AB:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_456789AB:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00000000:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_00000000:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00004444:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_00004444:
; AVX: # BB#0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
; SSE: # BB#0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_31206745:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_31206745:
; AVX: # BB#0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44440000:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44440000:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44440000:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44440000:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23016745:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23016745:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23026745:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23026745:
; AVX: # BB#0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23016747:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23016747:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_75643120:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_75643120:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_75643120:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_75643120:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_10545410:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_10545410:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_10545410:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_10545410:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_54105410:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_54105410:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_54105410:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_54105410:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_54101054:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_54101054:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_54101054:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_54101054:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04400440:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04400440:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04400440:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04400440:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_40044004:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_40044004:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_40044004:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_40044004:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_26405173:
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_26405173:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_26405173:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_26405173:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_20645173:
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_20645173:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_20645173:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_20645173:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_26401375:
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_26401375:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_26401375:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_26401375:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_66751643:
; SSE2: # BB#0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_66751643:
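; Shuffles that blend the input with zero at a regular stride should lower to
; a single psll*/psrl* logical shift rather than a byte shuffle.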
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_66751643:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_66751643:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_60514754:
; SSE2: # BB#0:
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_60514754:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_60514754:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_60514754:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_00444444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_00444444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_00444444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_00444444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44004444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44004444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44004444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44004444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04404444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04404444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04404444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04404444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04400000:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04400000:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04400000:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04400000:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_04404567:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04404567:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0X444444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0X444444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0X444444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0X444444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44X04444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44X04444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44X04444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44X04444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_X4404444:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_X4404444:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_X4404444:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_X4404444:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0127XXXX:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0127XXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_0127XXXX:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXXX4563:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXXX4563:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_XXXX4563:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_4563XXXX:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_4563XXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_4563XXXX:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_01274563:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_01274563:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_01274563:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_01274563:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_01274563:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_45630127:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_45630127:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_45630127:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_45630127:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_45630127:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_37102735:
; SSE2: # BB#0:
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_37102735:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_37102735:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_37102735:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_08192a3b:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_08192a3b:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
; SSE: # BB#0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_48596a7b:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_48596a7b:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_08196e7f:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_08196e7f:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0c1d6879:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d6879:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_109832ba:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_109832ba:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_8091a2b3:
; SSE: # BB#0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8091a2b3:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
; SSE: # BB#0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0213cedf:
; SSE: # BB#0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0213cedf:
; AVX: # BB#0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_443aXXXX:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[4,5,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_443aXXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_443aXXXX:
; AVX: # BB#0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_032dXXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_032dXXXX:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_012dXXXX:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_012dXXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_012dXXXX:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm0[6,7]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13],zero,zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[6,7,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_012dcde3:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_012dcde3:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,8,9,10,11,12,13],zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_012dcde3:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_012dcde3:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0923cde7:
; SSE2: # BB#0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0923cde7:
; SSSE3: # BB#0:
; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
; SSSE3-NEXT: andps %xmm2, %xmm0
; SSSE3-NEXT: andnps %xmm1, %xmm2
; SSSE3-NEXT: orps %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0923cde7:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0923cde7:
; AVX: # BB#0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXX1X579:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,0]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,xmm1[u,u],zero,zero,zero,zero,xmm1[2,3]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,10,11,14,15],zero,zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXX1X579:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XXX1X579:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
; SSE41: # BB#0:
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_8zzzzzzz:
; SSE: # BB#0:
; SSE-NEXT: movzwl %di, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8zzzzzzz:
; AVX: # BB#0:
; AVX-NEXT: movzwl %di, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $1, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $5, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $7, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 0
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $2, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%a = insertelement <8 x i16> undef, i16 %i, i32 3
%shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_def01234:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_def01234:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_def01234:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_def01234:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_ueuu123u:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_ueuu123u:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_ueuu123u:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_ueuu123u:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_56701234:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_56701234:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_56701234:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_56701234:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u6uu123u:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u6uu123u:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u6uu123u:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u6uu123u:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_uuuu123u:
; SSE: # BB#0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_uuuu123u:
; AVX: # BB#0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_bcdef012:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_bcdef012:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_bcdef012:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_bcdef012:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_ucdeuu1u:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_ucdeuu1u:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_ucdeuu1u:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_34567012:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_34567012:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_34567012:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_34567012:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u456uu1u:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u456uu1u:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u456uu1u:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uu1u:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u456uuuu:
; SSE: # BB#0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uuuu:
; AVX: # BB#0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_3456789a:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_3456789a:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_3456789a:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_3456789a:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u456uu9u:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u456uu9u:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u456uu9u:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uu9u:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_56789abc:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_56789abc:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_56789abc:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_56789abc:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u6uu9abu:
; SSE2: # BB#0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u6uu9abu:
; SSSE3: # BB#0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u6uu9abu:
; SSE41: # BB#0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u6uu9abu:
; AVX: # BB#0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0uuu1uuu:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0zzz1zzz:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0u1u2u3u:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u:
; SSSE3: # BB#0:
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0z1z2z3z:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x i16> %shuffle
}
;
; Shuffle to logical bit shifts
;
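; Note (explanatory comment, derived from the cases below): a shuffle that pulls
; zeros from the second (all-zero) operand into fixed positions is equivalent to
; a logical shift of a wider lane. For example, <z,0,z,2,z,4,z,6> moves each even
; i16 element up one slot within its 32-bit lane and zeroes the low half, which
; is exactly pslld $16; the remaining cases cover the analogous psllq/psrld/psrlq
; and whole-vector pslldq/psrldq patterns.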
define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_z0z2z4z6:
; SSE: # BB#0:
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z0z2z4z6:
; AVX: # BB#0:
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_zzz0zzz4:
; SSE: # BB#0:
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzz0zzz4:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $48, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_zz01zX4X:
; SSE: # BB#0:
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zz01zX4X:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 0, i32 1, i32 8, i32 undef, i32 4, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_z0X2z456:
; SSE: # BB#0:
; SSE-NEXT: psllq $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z0X2z456:
; AVX: # BB#0:
; AVX-NEXT: vpsllq $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 undef, i32 2, i32 8, i32 4, i32 5, i32 6>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_1z3zXz7z:
; SSE: # BB#0:
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_1z3zXz7z:
; AVX: # BB#0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 undef, i32 8, i32 7, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_1X3z567z:
; SSE: # BB#0:
; SSE-NEXT: psrlq $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_1X3z567z:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_23zz67zz:
; SSE: # BB#0:
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23zz67zz:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 2, i32 3, i32 8, i32 8, i32 6, i32 7, i32 8, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_3zXXXzzz:
; SSE: # BB#0:
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_3zXXXzzz:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_01u3zzuz:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_01u3zzuz:
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 3, i32 8, i32 8, i32 undef, i32 8>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0z234567:
; SSE2: # BB#0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0z234567:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0z234567:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0z234567:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0zzzz5z7:
; SSE2: # BB#0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0zzzz5z7:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0zzzz5z7:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0zzzz5z7:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 5, i32 8, i32 7>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0123456z:
; SSE2: # BB#0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0123456z:
; SSSE3: # BB#0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0123456z:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0123456z:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_fu3ucc5u:
; SSE: # BB#0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
; AVX: # BB#0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 15, i32 undef, i32 3, i32 undef, i32 12, i32 12, i32 5, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_8012345u:
; SSE: # BB#0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8012345u:
; AVX: # BB#0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef>
ret <8 x i16> %shuffle
}
define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mask_v8i16_012345ef:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mask_v8i16_012345ef:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_v8i16_012345ef:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: mask_v8i16_012345ef:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: mask_v8i16_012345ef:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2OR512VL-NEXT: retq
%1 = bitcast <8 x i16> %a to <2 x i64>
%2 = bitcast <8 x i16> %b to <2 x i64>
%3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
%4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
%5 = or <2 x i64> %4, %3
%6 = bitcast <2 x i64> %5 to <8 x i16>
ret <8 x i16> %6
}
define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_mem_v8i16_i32:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp3
}
define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_mem_v8i16_sext_i16:
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v8i16_sext_i16:
; SSE41: # BB#0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX2: # BB#0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp4
}
define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE: # BB#0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i16> %tmp3
}
define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE41: # BB#0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i16> %tmp3
}
define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; SSE41: # BB#0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; AVX2: # BB#0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i16> %tmp4
}
define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; SSE41: # BB#0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; AVX2: # BB#0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: vpbroadcastw %ax, %xmm0
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 1
%tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i16> %tmp4
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 6f5d916f2294..ba7c0894b932 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1,4118 +1,4118 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $15, %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
; AVX1: # BB#0:
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
; AVX1: # BB#0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
; AVX1: # BB#0:
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
; AVX2: # BB#0:
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,14,15,10,11,6,7,2,3]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX2: # BB#0:
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle
}
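; Note (added commentary, summarizing the checks below): the next two interleaves
; mix the low quarter of one 128-bit lane with the high quarter of the other, so
; there is no single unpack. The AVX2 checks therefore expect two vpshufb masks
; combined with a vpblendw, while AVX512VL is expected to use one variable word
; permute (vpermt2w) driven by a constant index vector loaded via vmovdqu.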
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
; AVX2: # BB#0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX2: # BB#0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11,28,29,30,31,18,19,20,21,30,31,16,17,24,25,26,27]
; AVX2OR512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 6, i32 7, i32 1, i32 2, i32 7, i32 0, i32 4, i32 5, i32 14, i32 15, i32 9, i32 10, i32 15, i32 8, i32 12, i32 13>
ret <16 x i16> %1
}
;
; Shuffle to logical bit shifts
;
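; Note (added commentary): when one operand is zeroinitializer and the zero lanes
; fall at a fixed period, the shuffle is equivalent to a plain vector shift. For
; example, <zz,00,zz,02,...> keeps word 0 of each 32-bit lane in its high half and
; zeroes the low half, i.e. vpslld $16; <01,zz,03,zz,...> is the matching
; vpsrld $16; and the four-word-period variants map to vpsllq/vpsrlq, as the
; checks below expect.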
define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
ret <16 x i16> %shuffle
}
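; Note (added commentary): interleaving the low words of %a with zeros is a zero
; extension in disguise, so the following shuffles are expected to lower to
; vpmovzxwq / vpmovzxwd; the final high-half case additionally blends with zero
; on AVX2 and uses a vpermt2w permute on AVX512VL.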
define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 28, i32 0, i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 30, i32 0, i32 0, i32 0, i32 31, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
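; Note (added commentary): the rotation-style shuffles below are expected to match
; (v)palignr, applied per 128-bit half on AVX1 and to the whole ymm on
; AVX2/AVX512VL; the last two, which rotate each source within its own low lane,
; instead go through vinserti128 + vpalignr on AVX2 and a vpermt2w permute on
; AVX512VL.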
define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 00, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
; AVX2: # BB#0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
; AVX2: # BB#0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 27, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1]
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 27, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 2, i32 22, i32 3, i32 31, i32 8, i32 28, i32 9, i32 29, i32 10, i32 30, i32 11, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 31, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 16, i32 5, i32 17, i32 6, i32 18, i32 7, i32 27, i32 12, i32 24, i32 13, i32 25, i32 14, i32 26, i32 15, i32 27>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 6, i32 22, i32 7, i32 31, i32 8, i32 24, i32 9, i32 25, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1,2,3,2,3,0,1,12,13,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 6, i32 16, i32 7, i32 25, i32 8, i32 28, i32 9, i32 29, i32 14, i32 24, i32 15, i32 25>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,12,13,10,11,8,9,10,11,12,13,10,11]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 17, i32 16, i32 3, i32 2, i32 19, i32 26, i32 9, i32 8, i32 25, i32 24, i32 11, i32 10, i32 27, i32 26>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,14,15,8,9,12,13,14,15]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 18, i32 2, i32 19, i32 11, i32 24, i32 8, i32 25, i32 9, i32 26, i32 10, i32 27, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,4,5,4,5,0,1,4,5,8,9,14,15]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 4, i32 21, i32 5, i32 22, i32 6, i32 23, i32 15, i32 28, i32 12, i32 29, i32 13, i32 30, i32 14, i32 31, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,4,5,14,15,0,1,4,5,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 1, i32 3, i32 20, i32 22, i32 21, i32 31, i32 8, i32 10, i32 9, i32 11, i32 28, i32 30, i32 29, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 3, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 12, i32 11, i32 26, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3,16,17,22,23,20,21,26,27,16,17,26,27,16,17,18,19]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 3, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 11, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 20, i32 21, i32 22, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 20, i32 21, i32 22, i32 11, i32 8, i32 9, i32 10, i32 29, i32 28, i32 29, i32 30, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 20, i32 21, i32 22, i32 15, i32 8, i32 25, i32 10, i32 11, i32 28, i32 29, i32 30, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 25, i32 undef, i32 undef, i32 undef, i32 9, i32 undef, i32 13, i32 15, i32 25>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 16, i32 18, i32 20, i32 undef, i32 undef, i32 undef, i32 12, i32 undef, i32 24, i32 26, i32 28, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 12, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9],ymm1[26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 22, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 30, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5],ymm1[22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 20, i32 21, i32 22, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 undef, i32 undef, i32 9, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 9, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,8,9,0,1,4,5,10,11]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5],ymm0[22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 17, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 25, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 28, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9],ymm0[26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 25, i32 26, i32 27, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # BB#0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
; ALL: # BB#0:
; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; ALL: # BB#0:
; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
ret <16 x i16> %1
}
define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
%2 = bitcast <16 x i16> %1 to <4 x i64>
%3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%4 = bitcast <4 x i64> %3 to <16 x i16>
ret <16 x i16> %4
}
define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
; ALL: # BB#0:
; ALL-NEXT: movzwl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: retq
%val = load i16, i16* %ptr
%i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
ret <16 x i16> %i0
}
define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
; AVX1: # BB#0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: retq
%alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuf
}
define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
; ALL: # BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
%ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
%bc1hi = bitcast <8 x i16> %bhi to <16 x i8>
%shuffle8 = shufflevector <16 x i8> %bc0hi, <16 x i8> %bc1hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%shuffle16 = bitcast <32 x i8> %shuffle8 to <16 x i16>
ret <16 x i16> %shuffle16
}
define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: PR24935:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR24935:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: PR24935:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 27, i32 26, i32 1, i32 29, i32 26, i32 23, i32 11, i32 16, i32 1, i32 9, i32 16, i32 28, i32 13, i32 4, i32 0, i32 24>
ret <16 x i16> %shuffle
}
define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> zeroinitializer
ret <16 x i16> %tmp3
}
define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX2: # BB#0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vpbroadcastw %ax, %ymm0
+; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer
ret <16 x i16> %tmp4
}
define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <16 x i16> %tmp3
}
define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %tmp3
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 05a797cb6f8e..d51b69415b93 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1,2442 +1,2442 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movl $1, %eax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: movl $128, %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: movl $15, %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $15, %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
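; The next two shuffles take alternating bytes from %a and %b, so they lower to
; a byte blend: vandps/vandnps/vorps on AVX1, vpblendvb with a constant mask on
; AVX2, and a k-register masked vmovdqu8/vpblendmb on AVX512VL.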
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
ret <32 x i8> %shuffle
}
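; Blending with zero instead of a second live input only needs a constant
; vandps mask on AVX1/AVX2, or a zero-masking vmovdqu8 on AVX512VL.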
define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
ret <32 x i8> %shuffle
}
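; The next several shuffles keep every result byte within its own 128-bit lane,
; so AVX2 and AVX512VL lower each one to a single in-lane vpshufb; AVX1 performs
; the equivalent shuffle on each 128-bit half and reassembles with vinsertf128.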
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
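; The next two tests interleave bytes within each 128-bit lane and map directly
; to a full-width vpunpcklbw / vpunpckhbw on AVX2 and AVX512VL; AVX1 unpacks
; each 128-bit half separately.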
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
; AVX1: # BB#0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,6,u,6,u,u,u,u,u,u,u,15,u,u,u,u]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[u,u],zero,zero,xmm2[12],zero,xmm2[u,u,u],zero,zero,xmm2[u,0,3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u],zero,zero,xmm4[u,u,u,u,1,6,13,u,u],zero,xmm4[u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
ret <32 x i8> %shuffle
}
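; The next four tests splat bytes 0 and 8 from one 128-bit half of %a and one
; half of %b, so AVX2 and AVX512VL first gather the two source halves into a
; single ymm (vinserti128, vperm2i128, or vpblendd) and then apply one in-lane
; vpshufb; AVX1 shuffles each half in xmm registers and reassembles.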
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
;
; Shuffle to logical bit shifts
;
define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
; AVX1: # BB#0:
; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 12, i32 13, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 20, i32 21, i32 32, i32 32, i32 24, i32 25, i32 32, i32 32, i32 28, i32 29>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
; AVX1: # BB#0:
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 32, i32 17, i32 32, i32 19, i32 32, i32 21, i32 32, i32 23, i32 32, i32 25, i32 32, i32 27, i32 32, i32 29, i32 32, i32 31, i32 32>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 32, i32 32, i32 18, i32 19, i32 32, i32 32, i32 22, i32 23, i32 32, i32 32, i32 26, i32 27, i32 32, i32 32, i32 30, i32 31, i32 32, i32 32>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpsrlq $56, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 23, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: movl $286331153, %eax # imm = 0x11111111
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 00, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX1: # BB#0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1OR2: # BB#0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle
}
define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-LABEL: PR28136:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,2,2,4,4,6,6,1,1,3,3,5,5,7,7]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR28136:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2OR512VL-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
%2 = bitcast <32 x i8> %1 to <4 x i64>
%3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
ret <4 x i64> %3
}
define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <32 x i32> zeroinitializer
ret <32 x i8> %tmp3
}
define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_sext_i8:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <32 x i32> zeroinitializer
ret <32 x i8> %tmp4
}
define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v32i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <32 x i8> %tmp3
}
define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt3_mem_v32i8_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v32i8_i32:
; AVX2OR512VL: # BB#0:
; AVX2OR512VL-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <32 x i8> %tmp3
}
define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX1: # BB#0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX2: # BB#0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $8, %eax
-; AVX512VL-NEXT: vpbroadcastb %al, %ymm0
+; AVX512VL-NEXT: vpbroadcastb %eax, %ymm0
; AVX512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <32 x i8> %tmp4
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 7a5c992bb829..b8fc27ba5515 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -1,376 +1,376 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefixes=ALL,KNL %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefixes=ALL,SKX %s
target triple = "x86_64-unknown-unknown"
define <32 x i16> @shuffle_v32i16(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16:
; KNL: ## BB#0:
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16:
; SKX: ## BB#0:
; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; KNL: ## BB#0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; SKX: ## BB#0:
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; KNL: ## BB#0:
; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0>
; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; SKX: ## BB#0:
; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1, i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 31>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## BB#0:
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5
; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; SKX: ## BB#0:
; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
; KNL: ## BB#0:
; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
; SKX: ## BB#0:
; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
; KNL: ## BB#0:
; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
; SKX: ## BB#0:
; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
; KNL: ## BB#0:
; KNL-NEXT: vpsrld $16, %ymm0, %ymm0
; KNL-NEXT: vpsrld $16, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
; SKX: ## BB#0:
; SKX-NEXT: vpsrld $16, %zmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
; KNL: ## BB#0:
; KNL-NEXT: vpslld $16, %ymm0, %ymm0
; KNL-NEXT: vpslld $16, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $16, %zmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
; KNL: ## BB#0:
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
; SKX: ## BB#0:
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
; KNL: ## BB#0:
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
; SKX: ## BB#0:
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; KNL: ## BB#0:
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; SKX: ## BB#0:
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL: ## BB#0:
; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
; KNL-NEXT: vmovd %eax, %xmm1
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## BB#0:
; SKX-NEXT: movl $1, %eax
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
ret <32 x i16> %shuffle
}
define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_i32:
; KNL: ## BB#0:
; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_i32:
; SKX: ## BB#0:
; SKX-NEXT: movl (%rdi), %eax
-; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> zeroinitializer
ret <32 x i16> %tmp3
}
define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
; KNL: ## BB#0:
; KNL-NEXT: movswl (%rdi), %eax
; KNL-NEXT: vmovd %eax, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
; SKX: ## BB#0:
; SKX-NEXT: movswl (%rdi), %eax
-; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
%tmp1 = sext i16 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <32 x i32> zeroinitializer
ret <32 x i16> %tmp4
}
define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32:
; KNL: ## BB#0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
; SKX: ## BB#0:
; SKX-NEXT: movzwl 2(%rdi), %eax
-; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <32 x i16> %tmp3
}
define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32:
; KNL: ## BB#0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
; SKX: ## BB#0:
; SKX-NEXT: movzwl 2(%rdi), %eax
-; SKX-NEXT: vpbroadcastw %ax, %zmm0
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1
%tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <32 x i16> %tmp3
}
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; KNL: ## BB#0:
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-NEXT: vmovdqa %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; SKX: ## BB#0:
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
ret <32 x i16> %shuffle
}
define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; KNL: ## BB#0:
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vmovdqa %ymm2, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; SKX: ## BB#0:
; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
ret <32 x i16> %shuffle
}
define <8 x i16> @pr32967(<32 x i16> %v) {
; KNL-LABEL: pr32967:
; KNL: ## BB#0:
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; KNL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; KNL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; KNL-NEXT: retq
;
; SKX-LABEL: pr32967:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $5, %xmm0, %eax
; SKX-NEXT: vpextrw $1, %xmm0, %ecx
; SKX-NEXT: vmovd %ecx, %xmm1
; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
; SKX-NEXT: vpextrw $1, %xmm2, %eax
; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; SKX-NEXT: vpextrw $5, %xmm2, %eax
; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; SKX-NEXT: vpextrw $1, %xmm2, %eax
; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, %eax
; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; SKX-NEXT: vpextrw $5, %xmm0, %eax
; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
ret <8 x i16> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index f4650ec741a7..9dca3191e06b 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1,557 +1,557 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VBMI
target triple = "x86_64-unknown-unknown"
define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512VBMI-NEXT: retq
%b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <64 x i8> %b
}
define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F: # BB#0:
; AVX512F-NEXT: movl $255, %eax
; AVX512F-NEXT: vmovd %eax, %xmm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: movl $255, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm1
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <64 x i8> %shuffle
}
define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_i32:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> zeroinitializer
ret <64 x i8> %tmp3
}
define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> zeroinitializer
ret <64 x i8> %tmp4
}
define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb 1(%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpbroadcastb 1(%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <64 x i8> %tmp3
}
define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpbroadcastb 3(%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpbroadcastb 3(%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
%tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <64 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <64 x i8> %tmp3
}
define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: movsbl (%rdi), %eax
; AVX512F-NEXT: shrl $8, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: movsbl (%rdi), %eax
; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: vpbroadcastb %al, %zmm0
+; AVX512BW-NEXT: vpbroadcastb %eax, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: movsbl (%rdi), %eax
; AVX512DQ-NEXT: shrl $8, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm0
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: movsbl (%rdi), %eax
; AVX512VBMI-NEXT: shrl $8, %eax
-; AVX512VBMI-NEXT: vpbroadcastb %al, %zmm0
+; AVX512VBMI-NEXT: vpbroadcastb %eax, %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
%tmp1 = sext i8 %tmp to i32
%tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
%tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
%tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <64 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <64 x i8> %tmp4
}
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 72, i32 0, i32 0, i32 0, i32 73, i32 0, i32 0, i32 0, i32 74, i32 0, i32 0, i32 0, i32 75, i32 0, i32 0, i32 0, i32 76, i32 0, i32 0, i32 0, i32 77, i32 0, i32 0, i32 0, i32 78, i32 0, i32 0, i32 0, i32 79, i32 0, i32 0, i32 0>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 65, i32 0, i32 66, i32 0, i32 67, i32 0, i32 68, i32 0, i32 69, i32 0, i32 70, i32 0, i32 71, i32 0, i32 72, i32 0, i32 73, i32 0, i32 74, i32 0, i32 75, i32 0, i32 76, i32 0, i32 77, i32 0, i32 78, i32 0, i32 79, i32 0, i32 80, i32 0, i32 81, i32 0, i32 82, i32 0, i32 83, i32 0, i32 84, i32 0, i32 85, i32 0, i32 86, i32 0, i32 87, i32 0, i32 88, i32 0, i32 89, i32 0, i32 90, i32 0, i32 91, i32 0, i32 92, i32 0, i32 93, i32 0, i32 94, i32 0, i32 95, i32 0>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 63, i32 64, i32 61, i32 64, i32 59, i32 64, i32 57, i32 64, i32 55, i32 64, i32 53, i32 64, i32 51, i32 64, i32 49, i32 64, i32 47, i32 64, i32 45, i32 64, i32 43, i32 64, i32 41, i32 64, i32 39, i32 64, i32 37, i32 64, i32 35, i32 64, i32 33, i32 64, i32 31, i32 64, i32 29, i32 64, i32 27, i32 64, i32 25, i32 64, i32 23, i32 64, i32 21, i32 64, i32 19, i32 64, i32 17, i32 64, i32 15, i32 64, i32 13, i32 64, i32 11, i32 64, i32 9, i32 64, i32 7, i32 64, i32 5, i32 64, i32 3, i32 64, i32 1, i32 64>
ret <64 x i8> %shuffle
}
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512BW-NEXT: vpbroadcastw {{.*}}(%rip), %ymm3
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512VBMI: # BB#0:
; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 63, i32 64, i32 61, i32 66, i32 59, i32 68, i32 57, i32 70, i32 55, i32 72, i32 53, i32 74, i32 51, i32 76, i32 49, i32 78, i32 47, i32 80, i32 45, i32 82, i32 43, i32 84, i32 41, i32 86, i32 39, i32 88, i32 37, i32 90, i32 35, i32 92, i32 33, i32 94, i32 31, i32 96, i32 29, i32 98, i32 27, i32 100, i32 25, i32 102, i32 23, i32 104, i32 21, i32 106, i32 19, i32 108, i32 17, i32 110, i32 15, i32 112, i32 13, i32 114, i32 11, i32 116, i32 9, i32 118, i32 7, i32 120, i32 5, i32 122, i32 3, i32 124, i32 1, i32 126>
ret <64 x i8> %shuffle
}
diff --git a/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt b/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt
new file mode 100644
index 000000000000..723cbc9086da
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/Inputs/shadow-args-abilist.txt
@@ -0,0 +1,8 @@
+fun:dfsan_get_label=uninstrumented
+fun:dfsan_get_label=custom
+
+fun:k2=uninstrumented
+fun:k2=custom
+
+fun:k4=uninstrumented
+fun:k4=custom
diff --git a/test/Instrumentation/DataFlowSanitizer/abilist.ll b/test/Instrumentation/DataFlowSanitizer/abilist.ll
index 8b30875a03fa..e33237ffe19d 100644
--- a/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -1,100 +1,100 @@
; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; CHECK: i32 @discard(i32 %a, i32 %b)
define i32 @discard(i32 %a, i32 %b) {
ret i32 0
}
; CHECK: i32 @functional(i32 %a, i32 %b)
define i32 @functional(i32 %a, i32 %b) {
%c = add i32 %a, %b
ret i32 %c
}
; CHECK: define i32 (i32, i32)* @discardg(i32)
; CHECK: %[[CALL:.*]] = call { i32 (i32, i32)*, i16 } @"dfs$g"(i32 %0, i16 0)
; CHECK: %[[XVAL:.*]] = extractvalue { i32 (i32, i32)*, i16 } %[[CALL]], 0
; CHECK: ret {{.*}} %[[XVAL]]
@discardg = alias i32 (i32, i32)* (i32), i32 (i32, i32)* (i32)* @g
declare void @custom1(i32 %a, i32 %b)
; CHECK: define linkonce_odr { i32, i16 } @"dfsw$custom2"(i32, i32, i16, i16)
; CHECK: %[[LABELRETURN2:.*]] = alloca i16
; CHECK: %[[RV:.*]] = call i32 @__dfsw_custom2
; CHECK: %[[RVSHADOW:.*]] = load i16, i16* %[[LABELRETURN2]]
; CHECK: insertvalue {{.*}}[[RV]], 0
; CHECK: insertvalue {{.*}}[[RVSHADOW]], 1
; CHECK: ret { i32, i16 }
declare i32 @custom2(i32 %a, i32 %b)
; CHECK: define linkonce_odr void @"dfsw$custom3"(i32, i16, i16*, ...)
; CHECK: call void @__dfsan_vararg_wrapper(i8*
; CHECK: unreachable
declare void @custom3(i32 %a, ...)
declare i32 @custom4(i32 %a, ...)
declare void @customcb(i32 (i32)* %cb)
declare i32 @cb(i32)
; CHECK: @"dfs$f"
define void @f(i32 %x) {
; CHECK: %[[LABELVA2:.*]] = alloca [2 x i16]
; CHECK: %[[LABELVA1:.*]] = alloca [2 x i16]
; CHECK: %[[LABELRETURN:.*]] = alloca i16
- ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 0, i16 0)
+ ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 zeroext 0, i16 zeroext 0)
call void @custom1(i32 1, i32 2)
- ; CHECK: call i32 @__dfsw_custom2(i32 1, i32 2, i16 0, i16 0, i16* %[[LABELRETURN]])
+ ; CHECK: call i32 @__dfsw_custom2(i32 1, i32 2, i16 zeroext 0, i16 zeroext 0, i16* %[[LABELRETURN]])
call i32 @custom2(i32 1, i32 2)
- ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 0)
+ ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 zeroext 0)
call void @customcb(i32 (i32)* @cb)
; CHECK: %[[LABELVA1_0:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 0
; CHECK: store i16 0, i16* %[[LABELVA1_0]]
; CHECK: %[[LABELVA1_1:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 1
; CHECK: store i16 %{{.*}}, i16* %[[LABELVA1_1]]
; CHECK: %[[LABELVA1_0A:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA1]], i32 0, i32 0
- ; CHECK: call void (i32, i16, i16*, ...) @__dfsw_custom3(i32 1, i16 0, i16* %[[LABELVA1_0A]], i32 2, i32 %{{.*}})
+ ; CHECK: call void (i32, i16, i16*, ...) @__dfsw_custom3(i32 1, i16 zeroext 0, i16* %[[LABELVA1_0A]], i32 2, i32 %{{.*}})
call void (i32, ...) @custom3(i32 1, i32 2, i32 %x)
; CHECK: %[[LABELVA2_0:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA2]], i32 0, i32 0
; CHECK: %[[LABELVA2_0A:.*]] = getelementptr inbounds [2 x i16], [2 x i16]* %[[LABELVA2]], i32 0, i32 0
- ; CHECK: call i32 (i32, i16, i16*, i16*, ...) @__dfsw_custom4(i32 1, i16 0, i16* %[[LABELVA2_0A]], i16* %[[LABELRETURN]], i32 2, i32 3)
+ ; CHECK: call i32 (i32, i16, i16*, i16*, ...) @__dfsw_custom4(i32 1, i16 zeroext 0, i16* %[[LABELVA2_0A]], i16* %[[LABELRETURN]], i32 2, i32 3)
call i32 (i32, ...) @custom4(i32 1, i32 2, i32 3)
ret void
}
; CHECK: @"dfs$g"
define i32 (i32, i32)* @g(i32) {
; CHECK: ret {{.*}} @"dfsw$custom2"
ret i32 (i32, i32)* @custom2
}
; CHECK: define { i32, i16 } @"dfs$adiscard"(i32, i32, i16, i16)
; CHECK: %[[CALL:.*]] = call i32 @discard(i32 %0, i32 %1)
; CHECK: %[[IVAL0:.*]] = insertvalue { i32, i16 } undef, i32 %[[CALL]], 0
; CHECK: %[[IVAL1:.*]] = insertvalue { i32, i16 } %[[IVAL0]], i16 0, 1
; CHECK: ret { i32, i16 } %[[IVAL1]]
@adiscard = alias i32 (i32, i32), i32 (i32, i32)* @discard
; CHECK: declare void @__dfsw_custom1(i32, i32, i16, i16)
; CHECK: declare i32 @__dfsw_custom2(i32, i32, i16, i16, i16*)
; CHECK-LABEL: define linkonce_odr i32 @"dfst0$customcb"(i32 (i32)*, i32, i16, i16*)
; CHECK: %[[BC:.*]] = bitcast i32 (i32)* %0 to { i32, i16 } (i32, i16)*
; CHECK: %[[CALL:.*]] = call { i32, i16 } %[[BC]](i32 %1, i16 %2)
; CHECK: %[[XVAL0:.*]] = extractvalue { i32, i16 } %[[CALL]], 0
; CHECK: %[[XVAL1:.*]] = extractvalue { i32, i16 } %[[CALL]], 1
; CHECK: store i16 %[[XVAL1]], i16* %3
; CHECK: ret i32 %[[XVAL0]]
; CHECK: declare void @__dfsw_custom3(i32, i16, i16*, ...)
; CHECK: declare i32 @__dfsw_custom4(i32, i16, i16*, i16*, ...)
diff --git a/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll b/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
new file mode 100644
index 000000000000..0ffbf1970e7f
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll
@@ -0,0 +1,54 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt | FileCheck %s
+
+; REQUIRES: x86-registered-target
+
+; Test that the custom ABI marks shadow parameters as zero-extended.
+
+define i32 @m() {
+entry:
+ %call = call zeroext i16 @dfsan_get_label(i64 signext 56)
+ %conv = zext i16 %call to i32
+ ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$m"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_dfsan_get_label(i64 signext 56, i16 zeroext 0, i16* %{{.*}})
+
+define i32 @k() {
+entry:
+ %call = call zeroext i16 @k2(i64 signext 56, i64 signext 67)
+ %conv = zext i16 %call to i32
+ ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$k"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_k2(i64 signext 56, i64 signext 67, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16* %{{.*}})
+
+define i32 @k3() {
+entry:
+ %call = call zeroext i16 @k4(i64 signext 56, i64 signext 67, i64 signext 78, i64 signext 89)
+ %conv = zext i16 %call to i32
+ ret i32 %conv
+}
+
+; CHECK-LABEL: @"dfs$k3"
+; CHECK: %{{.*}} = call zeroext i16 @__dfsw_k4(i64 signext 56, i64 signext 67, i64 signext 78, i64 signext 89, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16 zeroext {{.*}}, i16* %{{.*}})
+
+declare zeroext i16 @dfsan_get_label(i64 signext)
+
+; CHECK-LABEL: @"dfsw$dfsan_get_label"
+; CHECK: %{{.*}} = call i16 @__dfsw_dfsan_get_label(i64 %0, i16 zeroext %1, i16* %{{.*}})
+
+declare zeroext i16 @k2(i64 signext, i64 signext)
+; CHECK-LABEL: @"dfsw$k2"
+; CHECK: %{{.*}} = call i16 @__dfsw_k2(i64 %{{.*}}, i64 %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16* %{{.*}})
+
+declare zeroext i16 @k4(i64 signext, i64 signext, i64 signext, i64 signext)
+
+; CHECK-LABEL: @"dfsw$k4"
+; CHECK: %{{.*}} = call i16 @__dfsw_k4(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16 zeroext %{{.*}}, i16* %{{.*}})
+
+
+; CHECK: declare zeroext i16 @__dfsw_dfsan_get_label(i64 signext, i16, i16*)
+; CHECK: declare zeroext i16 @__dfsw_k2(i64 signext, i64 signext, i16, i16, i16*)
+; CHECK: declare zeroext i16 @__dfsw_k4(i64 signext, i64 signext, i64 signext, i64 signext, i16, i16, i16, i16, i16*)
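
The new test above exercises the requirement that DFSan's custom-ABI wrappers pass the i16 shadow labels as zero-extended arguments. As a rough, hypothetical illustration of how such an attribute can be attached with the LLVM C++ API (a sketch using the present-day CallBase interface, not the DataFlowSanitizer implementation; the helper name and the NumShadowArgs parameter are assumptions):

// Hypothetical sketch: mark the trailing i16 shadow arguments of a call as
// zero-extended, roughly mirroring what the CHECK lines above expect.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

static void markShadowArgsZExt(CallBase &CB, unsigned NumShadowArgs) {
  unsigned NumArgs = CB.arg_size();
  // Assume the shadow labels are passed as the last NumShadowArgs arguments.
  for (unsigned I = NumArgs - NumShadowArgs; I < NumArgs; ++I)
    CB.addParamAttr(I, Attribute::ZExt);
}
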
diff --git a/test/Transforms/BDCE/invalidate-assumptions.ll b/test/Transforms/BDCE/invalidate-assumptions.ll
new file mode 100644
index 000000000000..d165d74be86d
--- /dev/null
+++ b/test/Transforms/BDCE/invalidate-assumptions.ll
@@ -0,0 +1,100 @@
+; RUN: opt -bdce %s -S | FileCheck %s
+
+; The 'nuw' on the subtract allows us to deduce that %setbit is not demanded.
+; But if we change that value to '0', then the 'nuw' is no longer valid. If we don't
+; remove the 'nuw', another pass (-instcombine) may make a transform based on
+; that incorrect assumption and we can miscompile:
+; https://bugs.llvm.org/show_bug.cgi?id=33695
+
+define i1 @PR33695(i1 %b, i8 %x) {
+; CHECK-LABEL: @PR33695(
+; CHECK-NEXT: [[SETBIT:%.*]] = or i8 %x, 64
+; CHECK-NEXT: [[LITTLE_NUMBER:%.*]] = zext i1 %b to i8
+; CHECK-NEXT: [[BIG_NUMBER:%.*]] = shl i8 0, 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[SUB]] to i1
+; CHECK-NEXT: ret i1 [[TRUNC]]
+;
+ %setbit = or i8 %x, 64
+ %little_number = zext i1 %b to i8
+ %big_number = shl i8 %setbit, 1
+ %sub = sub nuw i8 %big_number, %little_number
+ %trunc = trunc i8 %sub to i1
+ ret i1 %trunc
+}
+
+; Similar to above, but now with more no-wrap.
+; https://bugs.llvm.org/show_bug.cgi?id=34037
+
+define i64 @PR34037(i64 %m, i32 %r, i64 %j, i1 %b, i32 %k, i64 %p) {
+; CHECK-LABEL: @PR34037(
+; CHECK-NEXT: [[CONV:%.*]] = zext i32 %r to i64
+; CHECK-NEXT: [[AND:%.*]] = and i64 %m, 0
+; CHECK-NEXT: [[NEG:%.*]] = xor i64 0, 34359738367
+; CHECK-NEXT: [[OR:%.*]] = or i64 %j, 0
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 0, 29
+; CHECK-NEXT: [[CONV1:%.*]] = select i1 %b, i64 7, i64 0
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[SHL]], [[CONV1]]
+; CHECK-NEXT: [[CONV2:%.*]] = zext i32 %k to i64
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[SUB]], [[CONV2]]
+; CHECK-NEXT: [[CONV4:%.*]] = and i64 %p, 65535
+; CHECK-NEXT: [[AND5:%.*]] = and i64 [[MUL]], [[CONV4]]
+; CHECK-NEXT: ret i64 [[AND5]]
+;
+ %conv = zext i32 %r to i64
+ %and = and i64 %m, %conv
+ %neg = xor i64 %and, 34359738367
+ %or = or i64 %j, %neg
+ %shl = shl i64 %or, 29
+ %conv1 = select i1 %b, i64 7, i64 0
+ %sub = sub nuw nsw i64 %shl, %conv1
+ %conv2 = zext i32 %k to i64
+ %mul = mul nsw i64 %sub, %conv2
+ %conv4 = and i64 %p, 65535
+ %and5 = and i64 %mul, %conv4
+ ret i64 %and5
+}
+
+; This is a manufactured example based on the 1st test to prove that the
+; assumption-killing algorithm stops at the call. I.e., it does not remove
+; nsw/nuw from the 'add' because a call demands all bits of its argument.
+
+declare i1 @foo(i1)
+
+define i1 @poison_on_call_user_is_ok(i1 %b, i8 %x) {
+; CHECK-LABEL: @poison_on_call_user_is_ok(
+; CHECK-NEXT: [[SETBIT:%.*]] = or i8 %x, 64
+; CHECK-NEXT: [[LITTLE_NUMBER:%.*]] = zext i1 %b to i8
+; CHECK-NEXT: [[BIG_NUMBER:%.*]] = shl i8 0, 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[SUB]] to i1
+; CHECK-NEXT: [[CALL_RESULT:%.*]] = call i1 @foo(i1 [[TRUNC]])
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i1 [[CALL_RESULT]], true
+; CHECK-NEXT: [[MUL:%.*]] = mul i1 [[TRUNC]], [[ADD]]
+; CHECK-NEXT: ret i1 [[MUL]]
+;
+ %setbit = or i8 %x, 64
+ %little_number = zext i1 %b to i8
+ %big_number = shl i8 %setbit, 1
+ %sub = sub nuw i8 %big_number, %little_number
+ %trunc = trunc i8 %sub to i1
+ %call_result = call i1 @foo(i1 %trunc)
+ %add = add nsw nuw i1 %call_result, 1
+ %mul = mul i1 %trunc, %add
+ ret i1 %mul
+}
+
+
+; We were asserting that all users of a trivialized integer-type instruction were
+; also integer-typed, but that's too strong. The alloca has a pointer-type result.
+
+define void @PR34179(i32* %a) {
+; CHECK-LABEL: @PR34179(
+; CHECK-NEXT: [[T0:%.*]] = load volatile i32, i32* %a
+; CHECK-NEXT: ret void
+;
+ %t0 = load volatile i32, i32* %a
+ %vla = alloca i32, i32 %t0
+ ret void
+}
+
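
The comments in the new BDCE test above describe why poison-generating flags such as nuw/nsw must be dropped once an operand is trivialized to a constant, and why the clearing stops at calls. A minimal sketch of the flag-dropping idea, assuming a hypothetical helper built on the existing Instruction::dropPoisonGeneratingFlags() API (this is not the actual BDCE code):

// Sketch: once Trivialized has had dead bits replaced (e.g. by zero), flags on
// its users that were justified by the old value may no longer hold, so drop
// them to keep later passes such as InstCombine from miscompiling.
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

static void dropFlagsOnUsers(Instruction &Trivialized) {
  for (User *U : Trivialized.users())
    if (auto *UI = dyn_cast<Instruction>(U))
      UI->dropPoisonGeneratingFlags(); // removes nuw, nsw, exact, ...
}
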
diff --git a/test/Transforms/IndVarSimplify/exit_value_test2.ll b/test/Transforms/IndVarSimplify/exit_value_test2.ll
index ee641667506c..7b6e91a742b2 100644
--- a/test/Transforms/IndVarSimplify/exit_value_test2.ll
+++ b/test/Transforms/IndVarSimplify/exit_value_test2.ll
@@ -1,52 +1,74 @@
; PR23538
; RUN: opt < %s -indvars -loop-deletion -S | FileCheck %s
; Check that IndVarSimplify does not replace the exit value, because otherwise
; a udiv would be introduced by the expander and the cost would be high.
-;
-; CHECK-LABEL: @_Z3fooPKcjj(
-; CHECK-NOT: udiv
declare void @_Z3mixRjj(i32* dereferenceable(4), i32)
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
define i32 @_Z3fooPKcjj(i8* nocapture readonly %s, i32 %len, i32 %c) {
+; CHECK-LABEL: @_Z3fooPKcjj(
+; CHECK-NOT: udiv
entry:
%a = alloca i32, align 4
%tmp = bitcast i32* %a to i8*
call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp)
store i32 -1640531527, i32* %a, align 4
%cmp8 = icmp ugt i32 %len, 11
br i1 %cmp8, label %while.body.lr.ph, label %while.end
while.body.lr.ph: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body, %while.body.lr.ph
%keylen.010 = phi i32 [ %len, %while.body.lr.ph ], [ %sub, %while.body ]
%s.addr.09 = phi i8* [ %s, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%tmp1 = bitcast i8* %s.addr.09 to i32*
%tmp2 = load i32, i32* %tmp1, align 4
%shl.i = shl i32 %tmp2, 1
%and.i = and i32 %shl.i, 16843008
%tmp3 = load i32, i32* %a, align 4
%sub.i = add i32 %tmp3, %tmp2
%add = sub i32 %sub.i, %and.i
store i32 %add, i32* %a, align 4
%add.ptr = getelementptr inbounds i8, i8* %s.addr.09, i64 12
%sub = add i32 %keylen.010, -12
%cmp = icmp ugt i32 %sub, 11
br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
while.cond.while.end_crit_edge: ; preds = %while.body
%sub.lcssa = phi i32 [ %sub, %while.body ]
br label %while.end
while.end: ; preds = %while.cond.while.end_crit_edge, %entry
%keylen.0.lcssa = phi i32 [ %sub.lcssa, %while.cond.while.end_crit_edge ], [ %len, %entry ]
call void @_Z3mixRjj(i32* dereferenceable(4) %a, i32 %keylen.0.lcssa)
%tmp4 = load i32, i32* %a, align 4
call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp)
ret i32 %tmp4
}
+
+define i32 @zero_backedge_count_test(i32 %unknown_init, i32* %unknown_mem) {
+; CHECK-LABEL: @zero_backedge_count_test(
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry], [ %iv.inc, %loop ]
+ %unknown_phi = phi i32 [ %unknown_init, %entry ], [ %unknown_next, %loop ]
+ %iv.inc = add i32 %iv, 1
+ %be_taken = icmp ne i32 %iv.inc, 1
+ %unknown_next = load volatile i32, i32* %unknown_mem
+ br i1 %be_taken, label %loop, label %leave
+
+leave:
+; We can fold %unknown_phi even though the backedge value for it is completely
+; unknown, since we can prove that the loop's backedge taken count is 0.
+
+; CHECK: leave:
+; CHECK: ret i32 %unknown_init
+ %exit_val = phi i32 [ %unknown_phi, %loop ]
+ ret i32 %exit_val
+}
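
The zero_backedge_count_test added above relies on the fact that a loop whose backedge-taken count is provably zero executes its body exactly once, so the exit value of %unknown_phi is simply %unknown_init. A hedged sketch of that check using the existing ScalarEvolution API (the helper name is hypothetical; this is not the IndVarSimplify implementation):

// Sketch: returns true when ScalarEvolution proves the loop's backedge is
// never taken, i.e. the body runs exactly once and loop phis keep their
// start values on exit.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"

using namespace llvm;

static bool backedgeNeverTaken(ScalarEvolution &SE, const Loop *L) {
  const SCEV *BTC = SE.getBackedgeTakenCount(L);
  return !isa<SCEVCouldNotCompute>(BTC) && BTC->isZero();
}
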
diff --git a/test/Transforms/SimplifyCFG/pr34131.ll b/test/Transforms/SimplifyCFG/pr34131.ll
new file mode 100644
index 000000000000..b64b6876e04e
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/pr34131.ll
@@ -0,0 +1,74 @@
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
+
+; Just checking for lack of crash here, but we should be able to check the IR?
+; Earlier version using auto-generated checks from utils/update_test_checks.py
+; had bot problems though...
+
+define void @patatino() {
+
+; CHECK-LABEL: @patatino
+
+ br label %bb1
+bb1: ; preds = %bb36, %0
+ br label %bb2
+bb2: ; preds = %bb3, %bb1
+ br i1 undef, label %bb4, label %bb3
+bb3: ; preds = %bb4, %bb2
+ br i1 undef, label %bb2, label %bb5
+bb4: ; preds = %bb2
+ switch i32 undef, label %bb3 [
+ ]
+bb5: ; preds = %bb3
+ br label %bb6
+bb6: ; preds = %bb5
+ br i1 undef, label %bb7, label %bb9
+bb7: ; preds = %bb6
+ %tmp = or i64 undef, 1
+ %tmp8 = icmp ult i64 %tmp, 0
+ br i1 %tmp8, label %bb12, label %bb9
+bb9: ; preds = %bb35, %bb34, %bb33, %bb32, %bb31, %bb30, %bb27, %bb24, %bb21, %bb18, %bb16, %bb14, %bb12, %bb7, %bb6
+ br label %bb11
+bb10: ; preds = %bb36
+ br label %bb11
+bb11: ; preds = %bb10, %bb9
+ ret void
+bb12: ; preds = %bb7
+ %tmp13 = icmp ult i64 0, 0
+ br i1 %tmp13, label %bb14, label %bb9
+bb14: ; preds = %bb12
+ %tmp15 = icmp ult i64 undef, 0
+ br i1 %tmp15, label %bb16, label %bb9
+bb16: ; preds = %bb14
+ %tmp17 = icmp ult i64 undef, 0
+ br i1 %tmp17, label %bb18, label %bb9
+bb18: ; preds = %bb16
+ %tmp19 = or i64 undef, 5
+ %tmp20 = icmp ult i64 %tmp19, 0
+ br i1 %tmp20, label %bb21, label %bb9
+bb21: ; preds = %bb18
+ %tmp22 = or i64 undef, 6
+ %tmp23 = icmp ult i64 %tmp22, 0
+ br i1 %tmp23, label %bb24, label %bb9
+bb24: ; preds = %bb21
+ %tmp25 = or i64 undef, 7
+ %tmp26 = icmp ult i64 %tmp25, 0
+ br i1 %tmp26, label %bb27, label %bb9
+bb27: ; preds = %bb24
+ %tmp28 = or i64 undef, 8
+ %tmp29 = icmp ult i64 %tmp28, 0
+ br i1 %tmp29, label %bb30, label %bb9
+bb30: ; preds = %bb27
+ br i1 undef, label %bb31, label %bb9
+bb31: ; preds = %bb30
+ br i1 undef, label %bb32, label %bb9
+bb32: ; preds = %bb31
+ br i1 undef, label %bb33, label %bb9
+bb33: ; preds = %bb32
+ br i1 undef, label %bb34, label %bb9
+bb34: ; preds = %bb33
+ br i1 undef, label %bb35, label %bb9
+bb35: ; preds = %bb34
+ br i1 undef, label %bb36, label %bb9
+bb36: ; preds = %bb35
+ br i1 undef, label %bb1, label %bb10
+}
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index d54b45515f05..74593e6202aa 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -1,2212 +1,2212 @@
//===-- llvm-objdump.cpp - Object file dumping utility for llvm -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This program is a utility that works like binutils "objdump", that is, it
// dumps out a plethora of information about an object file depending on the
// flags.
//
// The flags and output of this program should be near identical to those of
// binutils objdump.
//
//===----------------------------------------------------------------------===//
#include "llvm-objdump.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/COFFImportFile.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/Wasm.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <system_error>
#include <utility>
#include <unordered_map>
using namespace llvm;
using namespace object;
static cl::list<std::string>
InputFilenames(cl::Positional, cl::desc("<input object files>"),cl::ZeroOrMore);
cl::opt<bool>
llvm::Disassemble("disassemble",
cl::desc("Display assembler mnemonics for the machine instructions"));
static cl::alias
Disassembled("d", cl::desc("Alias for --disassemble"),
cl::aliasopt(Disassemble));
cl::opt<bool>
llvm::DisassembleAll("disassemble-all",
cl::desc("Display assembler mnemonics for the machine instructions"));
static cl::alias
DisassembleAlld("D", cl::desc("Alias for --disassemble-all"),
cl::aliasopt(DisassembleAll));
cl::opt<bool>
llvm::Relocations("r", cl::desc("Display the relocation entries in the file"));
cl::opt<bool>
llvm::SectionContents("s", cl::desc("Display the content of each section"));
cl::opt<bool>
llvm::SymbolTable("t", cl::desc("Display the symbol table"));
cl::opt<bool>
llvm::ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
cl::opt<bool>
llvm::Rebase("rebase", cl::desc("Display mach-o rebasing info"));
cl::opt<bool>
llvm::Bind("bind", cl::desc("Display mach-o binding info"));
cl::opt<bool>
llvm::LazyBind("lazy-bind", cl::desc("Display mach-o lazy binding info"));
cl::opt<bool>
llvm::WeakBind("weak-bind", cl::desc("Display mach-o weak binding info"));
cl::opt<bool>
llvm::RawClangAST("raw-clang-ast",
cl::desc("Dump the raw binary contents of the clang AST section"));
static cl::opt<bool>
MachOOpt("macho", cl::desc("Use MachO specific object file parser"));
static cl::alias
MachOm("m", cl::desc("Alias for --macho"), cl::aliasopt(MachOOpt));
cl::opt<std::string>
llvm::TripleName("triple", cl::desc("Target triple to disassemble for, "
"see -version for available targets"));
cl::opt<std::string>
llvm::MCPU("mcpu",
cl::desc("Target a specific cpu type (-mcpu=help for details)"),
cl::value_desc("cpu-name"),
cl::init(""));
cl::opt<std::string>
llvm::ArchName("arch-name", cl::desc("Target arch to disassemble for, "
"see -version for available targets"));
cl::opt<bool>
llvm::SectionHeaders("section-headers", cl::desc("Display summaries of the "
"headers for each section."));
static cl::alias
SectionHeadersShort("headers", cl::desc("Alias for --section-headers"),
cl::aliasopt(SectionHeaders));
static cl::alias
SectionHeadersShorter("h", cl::desc("Alias for --section-headers"),
cl::aliasopt(SectionHeaders));
cl::list<std::string>
llvm::FilterSections("section", cl::desc("Operate on the specified sections only. "
"With -macho dump segment,section"));
cl::alias
static FilterSectionsj("j", cl::desc("Alias for --section"),
cl::aliasopt(llvm::FilterSections));
cl::list<std::string>
llvm::MAttrs("mattr",
cl::CommaSeparated,
cl::desc("Target specific attributes"),
cl::value_desc("a1,+a2,-a3,..."));
cl::opt<bool>
llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling "
"instructions, do not print "
"the instruction bytes."));
cl::opt<bool>
llvm::NoLeadingAddr("no-leading-addr", cl::desc("Print no leading address"));
cl::opt<bool>
llvm::UnwindInfo("unwind-info", cl::desc("Display unwind information"));
static cl::alias
UnwindInfoShort("u", cl::desc("Alias for --unwind-info"),
cl::aliasopt(UnwindInfo));
cl::opt<bool>
llvm::PrivateHeaders("private-headers",
cl::desc("Display format specific file headers"));
cl::opt<bool>
llvm::FirstPrivateHeader("private-header",
cl::desc("Display only the first format specific file "
"header"));
static cl::alias
PrivateHeadersShort("p", cl::desc("Alias for --private-headers"),
cl::aliasopt(PrivateHeaders));
cl::opt<bool>
llvm::PrintImmHex("print-imm-hex",
cl::desc("Use hex format for immediate values"));
cl::opt<bool> PrintFaultMaps("fault-map-section",
cl::desc("Display contents of faultmap section"));
cl::opt<DIDumpType> llvm::DwarfDumpType(
"dwarf", cl::init(DIDT_Null), cl::desc("Dump of dwarf debug sections:"),
cl::values(clEnumValN(DIDT_Frames, "frames", ".debug_frame")));
cl::opt<bool> PrintSource(
"source",
cl::desc(
"Display source inlined with disassembly. Implies disassmble object"));
cl::alias PrintSourceShort("S", cl::desc("Alias for -source"),
cl::aliasopt(PrintSource));
cl::opt<bool> PrintLines("line-numbers",
cl::desc("Display source line numbers with "
"disassembly. Implies disassemble object"));
cl::alias PrintLinesShort("l", cl::desc("Alias for -line-numbers"),
cl::aliasopt(PrintLines));
cl::opt<unsigned long long>
StartAddress("start-address", cl::desc("Disassemble beginning at address"),
cl::value_desc("address"), cl::init(0));
cl::opt<unsigned long long>
StopAddress("stop-address", cl::desc("Stop disassembly at address"),
cl::value_desc("address"), cl::init(UINT64_MAX));
static StringRef ToolName;
typedef std::vector<std::tuple<uint64_t, StringRef, uint8_t>> SectionSymbolsTy;
namespace {
typedef std::function<bool(llvm::object::SectionRef const &)> FilterPredicate;
class SectionFilterIterator {
public:
SectionFilterIterator(FilterPredicate P,
llvm::object::section_iterator const &I,
llvm::object::section_iterator const &E)
: Predicate(std::move(P)), Iterator(I), End(E) {
ScanPredicate();
}
const llvm::object::SectionRef &operator*() const { return *Iterator; }
SectionFilterIterator &operator++() {
++Iterator;
ScanPredicate();
return *this;
}
bool operator!=(SectionFilterIterator const &Other) const {
return Iterator != Other.Iterator;
}
private:
void ScanPredicate() {
while (Iterator != End && !Predicate(*Iterator)) {
++Iterator;
}
}
FilterPredicate Predicate;
llvm::object::section_iterator Iterator;
llvm::object::section_iterator End;
};
class SectionFilter {
public:
SectionFilter(FilterPredicate P, llvm::object::ObjectFile const &O)
: Predicate(std::move(P)), Object(O) {}
SectionFilterIterator begin() {
return SectionFilterIterator(Predicate, Object.section_begin(),
Object.section_end());
}
SectionFilterIterator end() {
return SectionFilterIterator(Predicate, Object.section_end(),
Object.section_end());
}
private:
FilterPredicate Predicate;
llvm::object::ObjectFile const &Object;
};
SectionFilter ToolSectionFilter(llvm::object::ObjectFile const &O) {
return SectionFilter(
[](llvm::object::SectionRef const &S) {
if (FilterSections.empty())
return true;
llvm::StringRef String;
std::error_code error = S.getName(String);
if (error)
return false;
return is_contained(FilterSections, String);
},
O);
}
}
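
The SectionFilter and ToolSectionFilter helpers above let the rest of the tool iterate only the sections selected by --section with a plain range-for loop. A small hypothetical usage sketch (Obj stands for an already-loaded object file; the error_code-returning getName signature matches the one used elsewhere in this file):

// Hypothetical usage of ToolSectionFilter: print the names of the sections
// that pass the --section filter (or of all sections when no filter is set).
static void listFilteredSectionNames(const llvm::object::ObjectFile &Obj) {
  for (const llvm::object::SectionRef &S : ToolSectionFilter(Obj)) {
    llvm::StringRef Name;
    if (!S.getName(Name)) // no error_code means the name was retrieved
      llvm::outs() << Name << "\n";
  }
}
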
void llvm::error(std::error_code EC) {
if (!EC)
return;
errs() << ToolName << ": error reading file: " << EC.message() << ".\n";
errs().flush();
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::error(Twine Message) {
errs() << ToolName << ": " << Message << ".\n";
errs().flush();
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
Twine Message) {
errs() << ToolName << ": '" << File << "': " << Message << ".\n";
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
std::error_code EC) {
assert(EC);
errs() << ToolName << ": '" << File << "': " << EC.message() << ".\n";
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef File,
llvm::Error E) {
assert(E);
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(std::move(E), OS, "");
OS.flush();
errs() << ToolName << ": '" << File << "': " << Buf;
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef ArchiveName,
StringRef FileName,
llvm::Error E,
StringRef ArchitectureName) {
assert(E);
errs() << ToolName << ": ";
if (ArchiveName != "")
errs() << ArchiveName << "(" << FileName << ")";
else
errs() << "'" << FileName << "'";
if (!ArchitectureName.empty())
errs() << " (for architecture " << ArchitectureName << ")";
std::string Buf;
raw_string_ostream OS(Buf);
logAllUnhandledErrors(std::move(E), OS, "");
OS.flush();
errs() << ": " << Buf;
exit(1);
}
LLVM_ATTRIBUTE_NORETURN void llvm::report_error(StringRef ArchiveName,
const object::Archive::Child &C,
llvm::Error E,
StringRef ArchitectureName) {
Expected<StringRef> NameOrErr = C.getName();
// TODO: if we have an error getting the name then it would be nice to print
// the index of which archive member this is and/or its offset in the
// archive instead of "???" as the name.
if (!NameOrErr) {
consumeError(NameOrErr.takeError());
llvm::report_error(ArchiveName, "???", std::move(E), ArchitectureName);
} else
llvm::report_error(ArchiveName, NameOrErr.get(), std::move(E),
ArchitectureName);
}
static const Target *getTarget(const ObjectFile *Obj = nullptr) {
// Figure out the target triple.
llvm::Triple TheTriple("unknown-unknown-unknown");
if (TripleName.empty()) {
if (Obj) {
auto Arch = Obj->getArch();
TheTriple.setArch(Triple::ArchType(Arch));
// For ARM targets, try to use the build attributes to determine
// the build target. Target features are also added, but later during
// disassembly.
if (Arch == Triple::arm || Arch == Triple::armeb) {
Obj->setARMSubArch(TheTriple);
}
// TheTriple defaults to ELF, and COFF doesn't have an environment:
// the best we can do here is indicate that it is mach-o.
if (Obj->isMachO())
TheTriple.setObjectFormat(Triple::MachO);
if (Obj->isCOFF()) {
const auto COFFObj = dyn_cast<COFFObjectFile>(Obj);
if (COFFObj->getArch() == Triple::thumb)
TheTriple.setTriple("thumbv7-windows");
}
}
} else {
TheTriple.setTriple(Triple::normalize(TripleName));
// Use the triple, but also try to combine with ARM build attributes.
if (Obj) {
auto Arch = Obj->getArch();
if (Arch == Triple::arm || Arch == Triple::armeb) {
Obj->setARMSubArch(TheTriple);
}
}
}
// Get the target specific parser.
std::string Error;
const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
Error);
if (!TheTarget) {
if (Obj)
report_error(Obj->getFileName(), "can't find target: " + Error);
else
error("can't find target: " + Error);
}
// Update the triple name and return the found target.
TripleName = TheTriple.getTriple();
return TheTarget;
}
bool llvm::RelocAddressLess(RelocationRef a, RelocationRef b) {
return a.getOffset() < b.getOffset();
}
namespace {
class SourcePrinter {
protected:
DILineInfo OldLineInfo;
const ObjectFile *Obj;
std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
// File name to file contents of source
std::unordered_map<std::string, std::unique_ptr<MemoryBuffer>> SourceCache;
// Mark the line endings of the cached source
std::unordered_map<std::string, std::vector<StringRef>> LineCache;
private:
bool cacheSource(std::string File);
public:
virtual ~SourcePrinter() {}
SourcePrinter() : Obj(nullptr), Symbolizer(nullptr) {}
SourcePrinter(const ObjectFile *Obj, StringRef DefaultArch) : Obj(Obj) {
symbolize::LLVMSymbolizer::Options SymbolizerOpts(
DILineInfoSpecifier::FunctionNameKind::None, true, false, false,
DefaultArch);
Symbolizer.reset(new symbolize::LLVMSymbolizer(SymbolizerOpts));
}
virtual void printSourceLine(raw_ostream &OS, uint64_t Address,
StringRef Delimiter = "; ");
};
bool SourcePrinter::cacheSource(std::string File) {
auto BufferOrError = MemoryBuffer::getFile(File);
if (!BufferOrError)
return false;
// Chomp the file to get lines
size_t BufferSize = (*BufferOrError)->getBufferSize();
const char *BufferStart = (*BufferOrError)->getBufferStart();
for (const char *Start = BufferStart, *End = BufferStart;
End < BufferStart + BufferSize; End++)
if (*End == '\n' || End == BufferStart + BufferSize - 1 ||
(*End == '\r' && *(End + 1) == '\n')) {
LineCache[File].push_back(StringRef(Start, End - Start));
if (*End == '\r')
End++;
Start = End + 1;
}
SourceCache[File] = std::move(*BufferOrError);
return true;
}
void SourcePrinter::printSourceLine(raw_ostream &OS, uint64_t Address,
StringRef Delimiter) {
if (!Symbolizer)
return;
DILineInfo LineInfo = DILineInfo();
auto ExpectedLineInfo =
Symbolizer->symbolizeCode(Obj->getFileName(), Address);
if (!ExpectedLineInfo)
consumeError(ExpectedLineInfo.takeError());
else
LineInfo = *ExpectedLineInfo;
if ((LineInfo.FileName == "<invalid>") || OldLineInfo.Line == LineInfo.Line ||
LineInfo.Line == 0)
return;
if (PrintLines)
OS << Delimiter << LineInfo.FileName << ":" << LineInfo.Line << "\n";
if (PrintSource) {
if (SourceCache.find(LineInfo.FileName) == SourceCache.end())
if (!cacheSource(LineInfo.FileName))
return;
auto FileBuffer = SourceCache.find(LineInfo.FileName);
if (FileBuffer != SourceCache.end()) {
auto LineBuffer = LineCache.find(LineInfo.FileName);
if (LineBuffer != LineCache.end()) {
if (LineInfo.Line > LineBuffer->second.size())
return;
// Vector begins at 0, line numbers are non-zero
OS << Delimiter << LineBuffer->second[LineInfo.Line - 1].ltrim()
<< "\n";
}
}
}
OldLineInfo = LineInfo;
}
static bool isArmElf(const ObjectFile *Obj) {
return (Obj->isELF() &&
(Obj->getArch() == Triple::aarch64 ||
Obj->getArch() == Triple::aarch64_be ||
Obj->getArch() == Triple::arm || Obj->getArch() == Triple::armeb ||
Obj->getArch() == Triple::thumb ||
Obj->getArch() == Triple::thumbeb));
}
class PrettyPrinter {
public:
virtual ~PrettyPrinter(){}
virtual void printInst(MCInstPrinter &IP, const MCInst *MI,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &OS, StringRef Annot,
MCSubtargetInfo const &STI, SourcePrinter *SP) {
if (SP && (PrintSource || PrintLines))
SP->printSourceLine(OS, Address);
if (!NoLeadingAddr)
OS << format("%8" PRIx64 ":", Address);
if (!NoShowRawInsn) {
OS << "\t";
dumpBytes(Bytes, OS);
}
if (MI)
IP.printInst(MI, OS, "", STI);
else
OS << " <unknown>";
}
};
PrettyPrinter PrettyPrinterInst;
class HexagonPrettyPrinter : public PrettyPrinter {
public:
void printLead(ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &OS) {
uint32_t opcode =
(Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | Bytes[0];
if (!NoLeadingAddr)
OS << format("%8" PRIx64 ":", Address);
if (!NoShowRawInsn) {
OS << "\t";
dumpBytes(Bytes.slice(0, 4), OS);
OS << format("%08" PRIx32, opcode);
}
}
void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &OS, StringRef Annot,
MCSubtargetInfo const &STI, SourcePrinter *SP) override {
if (SP && (PrintSource || PrintLines))
SP->printSourceLine(OS, Address, "");
if (!MI) {
printLead(Bytes, Address, OS);
OS << " <unknown>";
return;
}
std::string Buffer;
{
raw_string_ostream TempStream(Buffer);
IP.printInst(MI, TempStream, "", STI);
}
StringRef Contents(Buffer);
// Split off bundle attributes
auto PacketBundle = Contents.rsplit('\n');
// Split off first instruction from the rest
auto HeadTail = PacketBundle.first.split('\n');
auto Preamble = " { ";
auto Separator = "";
while(!HeadTail.first.empty()) {
OS << Separator;
Separator = "\n";
if (SP && (PrintSource || PrintLines))
SP->printSourceLine(OS, Address, "");
printLead(Bytes, Address, OS);
OS << Preamble;
Preamble = " ";
StringRef Inst;
auto Duplex = HeadTail.first.split('\v');
if(!Duplex.second.empty()){
OS << Duplex.first;
OS << "; ";
Inst = Duplex.second;
}
else
Inst = HeadTail.first;
OS << Inst;
Bytes = Bytes.slice(4);
Address += 4;
HeadTail = HeadTail.second.split('\n');
}
OS << " } " << PacketBundle.second;
}
};
HexagonPrettyPrinter HexagonPrettyPrinterInst;
class AMDGCNPrettyPrinter : public PrettyPrinter {
public:
void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &OS, StringRef Annot,
MCSubtargetInfo const &STI, SourcePrinter *SP) override {
if (SP && (PrintSource || PrintLines))
SP->printSourceLine(OS, Address);
if (!MI) {
OS << " <unknown>";
return;
}
SmallString<40> InstStr;
raw_svector_ostream IS(InstStr);
IP.printInst(MI, IS, "", STI);
OS << left_justify(IS.str(), 60) << format("// %012" PRIX64 ": ", Address);
typedef support::ulittle32_t U32;
for (auto D : makeArrayRef(reinterpret_cast<const U32*>(Bytes.data()),
Bytes.size() / sizeof(U32)))
// D should be explicitly cast to uint32_t here as it is passed
// by format to snprintf as a vararg.
OS << format("%08" PRIX32 " ", static_cast<uint32_t>(D));
if (!Annot.empty())
OS << "// " << Annot;
}
};
AMDGCNPrettyPrinter AMDGCNPrettyPrinterInst;
class BPFPrettyPrinter : public PrettyPrinter {
public:
void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &OS, StringRef Annot,
MCSubtargetInfo const &STI, SourcePrinter *SP) override {
if (SP && (PrintSource || PrintLines))
SP->printSourceLine(OS, Address);
if (!NoLeadingAddr)
OS << format("%8" PRId64 ":", Address / 8);
if (!NoShowRawInsn) {
OS << "\t";
dumpBytes(Bytes, OS);
}
if (MI)
IP.printInst(MI, OS, "", STI);
else
OS << " <unknown>";
}
};
BPFPrettyPrinter BPFPrettyPrinterInst;
PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
switch(Triple.getArch()) {
default:
return PrettyPrinterInst;
case Triple::hexagon:
return HexagonPrettyPrinterInst;
case Triple::amdgcn:
return AMDGCNPrettyPrinterInst;
case Triple::bpfel:
case Triple::bpfeb:
return BPFPrettyPrinterInst;
}
}
}
template <class ELFT>
static std::error_code getRelocationValueString(const ELFObjectFile<ELFT> *Obj,
const RelocationRef &RelRef,
SmallVectorImpl<char> &Result) {
DataRefImpl Rel = RelRef.getRawDataRefImpl();
typedef typename ELFObjectFile<ELFT>::Elf_Sym Elf_Sym;
typedef typename ELFObjectFile<ELFT>::Elf_Shdr Elf_Shdr;
typedef typename ELFObjectFile<ELFT>::Elf_Rela Elf_Rela;
const ELFFile<ELFT> &EF = *Obj->getELFFile();
auto SecOrErr = EF.getSection(Rel.d.a);
if (!SecOrErr)
return errorToErrorCode(SecOrErr.takeError());
const Elf_Shdr *Sec = *SecOrErr;
auto SymTabOrErr = EF.getSection(Sec->sh_link);
if (!SymTabOrErr)
return errorToErrorCode(SymTabOrErr.takeError());
const Elf_Shdr *SymTab = *SymTabOrErr;
assert(SymTab->sh_type == ELF::SHT_SYMTAB ||
SymTab->sh_type == ELF::SHT_DYNSYM);
auto StrTabSec = EF.getSection(SymTab->sh_link);
if (!StrTabSec)
return errorToErrorCode(StrTabSec.takeError());
auto StrTabOrErr = EF.getStringTable(*StrTabSec);
if (!StrTabOrErr)
return errorToErrorCode(StrTabOrErr.takeError());
StringRef StrTab = *StrTabOrErr;
uint8_t type = RelRef.getType();
StringRef res;
int64_t addend = 0;
switch (Sec->sh_type) {
default:
return object_error::parse_failed;
case ELF::SHT_REL: {
// TODO: Read implicit addend from section data.
break;
}
case ELF::SHT_RELA: {
const Elf_Rela *ERela = Obj->getRela(Rel);
addend = ERela->r_addend;
break;
}
}
symbol_iterator SI = RelRef.getSymbol();
const Elf_Sym *symb = Obj->getSymbol(SI->getRawDataRefImpl());
StringRef Target;
if (symb->getType() == ELF::STT_SECTION) {
Expected<section_iterator> SymSI = SI->getSection();
if (!SymSI)
return errorToErrorCode(SymSI.takeError());
const Elf_Shdr *SymSec = Obj->getSection((*SymSI)->getRawDataRefImpl());
auto SecName = EF.getSectionName(SymSec);
if (!SecName)
return errorToErrorCode(SecName.takeError());
Target = *SecName;
} else {
Expected<StringRef> SymName = symb->getName(StrTab);
if (!SymName)
return errorToErrorCode(SymName.takeError());
Target = *SymName;
}
switch (EF.getHeader()->e_machine) {
case ELF::EM_X86_64:
switch (type) {
case ELF::R_X86_64_PC8:
case ELF::R_X86_64_PC16:
case ELF::R_X86_64_PC32: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
fmt << Target << (addend < 0 ? "" : "+") << addend << "-P";
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
} break;
case ELF::R_X86_64_8:
case ELF::R_X86_64_16:
case ELF::R_X86_64_32:
case ELF::R_X86_64_32S:
case ELF::R_X86_64_64: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
fmt << Target << (addend < 0 ? "" : "+") << addend;
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
} break;
default:
res = "Unknown";
}
break;
case ELF::EM_LANAI:
case ELF::EM_AVR:
case ELF::EM_AARCH64: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
fmt << Target;
if (addend != 0)
fmt << (addend < 0 ? "" : "+") << addend;
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
break;
}
case ELF::EM_386:
case ELF::EM_IAMCU:
case ELF::EM_ARM:
case ELF::EM_HEXAGON:
case ELF::EM_MIPS:
case ELF::EM_BPF:
case ELF::EM_RISCV:
res = Target;
break;
case ELF::EM_WEBASSEMBLY:
switch (type) {
case ELF::R_WEBASSEMBLY_DATA: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
fmt << Target << (addend < 0 ? "" : "+") << addend;
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
break;
}
case ELF::R_WEBASSEMBLY_FUNCTION:
res = Target;
break;
default:
res = "Unknown";
}
break;
default:
res = "Unknown";
}
if (Result.empty())
Result.append(res.begin(), res.end());
return std::error_code();
}
static std::error_code getRelocationValueString(const ELFObjectFileBase *Obj,
const RelocationRef &Rel,
SmallVectorImpl<char> &Result) {
if (auto *ELF32LE = dyn_cast<ELF32LEObjectFile>(Obj))
return getRelocationValueString(ELF32LE, Rel, Result);
if (auto *ELF64LE = dyn_cast<ELF64LEObjectFile>(Obj))
return getRelocationValueString(ELF64LE, Rel, Result);
if (auto *ELF32BE = dyn_cast<ELF32BEObjectFile>(Obj))
return getRelocationValueString(ELF32BE, Rel, Result);
auto *ELF64BE = cast<ELF64BEObjectFile>(Obj);
return getRelocationValueString(ELF64BE, Rel, Result);
}
static std::error_code getRelocationValueString(const COFFObjectFile *Obj,
const RelocationRef &Rel,
SmallVectorImpl<char> &Result) {
symbol_iterator SymI = Rel.getSymbol();
Expected<StringRef> SymNameOrErr = SymI->getName();
if (!SymNameOrErr)
return errorToErrorCode(SymNameOrErr.takeError());
StringRef SymName = *SymNameOrErr;
Result.append(SymName.begin(), SymName.end());
return std::error_code();
}
static void printRelocationTargetName(const MachOObjectFile *O,
const MachO::any_relocation_info &RE,
raw_string_ostream &fmt) {
bool IsScattered = O->isRelocationScattered(RE);
// Target of a scattered relocation is an address. In the interest of
// generating pretty output, scan through the symbol table looking for a
// symbol that aligns with that address. If we find one, print it.
// Otherwise, we just print the hex address of the target.
if (IsScattered) {
uint32_t Val = O->getPlainRelocationSymbolNum(RE);
for (const SymbolRef &Symbol : O->symbols()) {
std::error_code ec;
Expected<uint64_t> Addr = Symbol.getAddress();
if (!Addr)
report_error(O->getFileName(), Addr.takeError());
if (*Addr != Val)
continue;
Expected<StringRef> Name = Symbol.getName();
if (!Name)
report_error(O->getFileName(), Name.takeError());
fmt << *Name;
return;
}
// If we couldn't find a symbol that this relocation refers to, try
// to find a section beginning instead.
for (const SectionRef &Section : ToolSectionFilter(*O)) {
std::error_code ec;
StringRef Name;
uint64_t Addr = Section.getAddress();
if (Addr != Val)
continue;
if ((ec = Section.getName(Name)))
report_error(O->getFileName(), ec);
fmt << Name;
return;
}
fmt << format("0x%x", Val);
return;
}
StringRef S;
bool isExtern = O->getPlainRelocationExternal(RE);
uint64_t Val = O->getPlainRelocationSymbolNum(RE);
if (O->getAnyRelocationType(RE) == MachO::ARM64_RELOC_ADDEND) {
- fmt << format("0x%x", Val);
+ fmt << format("0x%0" PRIx64, Val);
return;
} else if (isExtern) {
symbol_iterator SI = O->symbol_begin();
advance(SI, Val);
Expected<StringRef> SOrErr = SI->getName();
if (!SOrErr)
report_error(O->getFileName(), SOrErr.takeError());
S = *SOrErr;
} else {
section_iterator SI = O->section_begin();
// Adjust for the fact that sections are 1-indexed.
advance(SI, Val - 1);
SI->getName(S);
}
fmt << S;
}
static std::error_code getRelocationValueString(const WasmObjectFile *Obj,
const RelocationRef &RelRef,
SmallVectorImpl<char> &Result) {
const wasm::WasmRelocation& Rel = Obj->getWasmRelocation(RelRef);
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
fmt << Rel.Index << (Rel.Addend < 0 ? "" : "+") << Rel.Addend;
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
return std::error_code();
}
static std::error_code getRelocationValueString(const MachOObjectFile *Obj,
const RelocationRef &RelRef,
SmallVectorImpl<char> &Result) {
DataRefImpl Rel = RelRef.getRawDataRefImpl();
MachO::any_relocation_info RE = Obj->getRelocation(Rel);
unsigned Arch = Obj->getArch();
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
unsigned Type = Obj->getAnyRelocationType(RE);
bool IsPCRel = Obj->getAnyRelocationPCRel(RE);
// Determine any addends that should be displayed with the relocation.
// These require decoding the relocation type, which is triple-specific.
// X86_64 has entirely custom relocation types.
if (Arch == Triple::x86_64) {
bool isPCRel = Obj->getAnyRelocationPCRel(RE);
switch (Type) {
case MachO::X86_64_RELOC_GOT_LOAD:
case MachO::X86_64_RELOC_GOT: {
printRelocationTargetName(Obj, RE, fmt);
fmt << "@GOT";
if (isPCRel)
fmt << "PCREL";
break;
}
case MachO::X86_64_RELOC_SUBTRACTOR: {
DataRefImpl RelNext = Rel;
Obj->moveRelocationNext(RelNext);
MachO::any_relocation_info RENext = Obj->getRelocation(RelNext);
// X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type
// X86_64_RELOC_UNSIGNED.
// NOTE: Scattered relocations don't exist on x86_64.
unsigned RType = Obj->getAnyRelocationType(RENext);
if (RType != MachO::X86_64_RELOC_UNSIGNED)
report_error(Obj->getFileName(), "Expected X86_64_RELOC_UNSIGNED after "
"X86_64_RELOC_SUBTRACTOR.");
// The X86_64_RELOC_UNSIGNED contains the minuend symbol;
// X86_64_RELOC_SUBTRACTOR contains the subtrahend.
printRelocationTargetName(Obj, RENext, fmt);
fmt << "-";
printRelocationTargetName(Obj, RE, fmt);
break;
}
case MachO::X86_64_RELOC_TLV:
printRelocationTargetName(Obj, RE, fmt);
fmt << "@TLV";
if (isPCRel)
fmt << "P";
break;
case MachO::X86_64_RELOC_SIGNED_1:
printRelocationTargetName(Obj, RE, fmt);
fmt << "-1";
break;
case MachO::X86_64_RELOC_SIGNED_2:
printRelocationTargetName(Obj, RE, fmt);
fmt << "-2";
break;
case MachO::X86_64_RELOC_SIGNED_4:
printRelocationTargetName(Obj, RE, fmt);
fmt << "-4";
break;
default:
printRelocationTargetName(Obj, RE, fmt);
break;
}
// X86 and ARM share some relocation types in common.
} else if (Arch == Triple::x86 || Arch == Triple::arm ||
Arch == Triple::ppc) {
// Generic relocation types...
switch (Type) {
case MachO::GENERIC_RELOC_PAIR: // prints no info
return std::error_code();
case MachO::GENERIC_RELOC_SECTDIFF: {
DataRefImpl RelNext = Rel;
Obj->moveRelocationNext(RelNext);
MachO::any_relocation_info RENext = Obj->getRelocation(RelNext);
// X86 sect diff's must be followed by a relocation of type
// GENERIC_RELOC_PAIR.
unsigned RType = Obj->getAnyRelocationType(RENext);
if (RType != MachO::GENERIC_RELOC_PAIR)
report_error(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after "
"GENERIC_RELOC_SECTDIFF.");
printRelocationTargetName(Obj, RE, fmt);
fmt << "-";
printRelocationTargetName(Obj, RENext, fmt);
break;
}
}
if (Arch == Triple::x86 || Arch == Triple::ppc) {
switch (Type) {
case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
DataRefImpl RelNext = Rel;
Obj->moveRelocationNext(RelNext);
MachO::any_relocation_info RENext = Obj->getRelocation(RelNext);
// X86 sect diff's must be followed by a relocation of type
// GENERIC_RELOC_PAIR.
unsigned RType = Obj->getAnyRelocationType(RENext);
if (RType != MachO::GENERIC_RELOC_PAIR)
report_error(Obj->getFileName(), "Expected GENERIC_RELOC_PAIR after "
"GENERIC_RELOC_LOCAL_SECTDIFF.");
printRelocationTargetName(Obj, RE, fmt);
fmt << "-";
printRelocationTargetName(Obj, RENext, fmt);
break;
}
case MachO::GENERIC_RELOC_TLV: {
printRelocationTargetName(Obj, RE, fmt);
fmt << "@TLV";
if (IsPCRel)
fmt << "P";
break;
}
default:
printRelocationTargetName(Obj, RE, fmt);
}
} else { // ARM-specific relocations
switch (Type) {
case MachO::ARM_RELOC_HALF:
case MachO::ARM_RELOC_HALF_SECTDIFF: {
// Half relocations steal a bit from the length field to encode
// whether this is an upper16 or a lower16 relocation.
bool isUpper = (Obj->getAnyRelocationLength(RE) & 0x1) == 1;
if (isUpper)
fmt << ":upper16:(";
else
fmt << ":lower16:(";
printRelocationTargetName(Obj, RE, fmt);
DataRefImpl RelNext = Rel;
Obj->moveRelocationNext(RelNext);
MachO::any_relocation_info RENext = Obj->getRelocation(RelNext);
// ARM half relocs must be followed by a relocation of type
// ARM_RELOC_PAIR.
unsigned RType = Obj->getAnyRelocationType(RENext);
if (RType != MachO::ARM_RELOC_PAIR)
report_error(Obj->getFileName(), "Expected ARM_RELOC_PAIR after "
"ARM_RELOC_HALF");
// NOTE: The half of the target virtual address is stashed in the
// address field of the secondary relocation, but we can't reverse
// engineer the constant offset from it without decoding the movw/movt
// instruction to find the other half in its immediate field.
// ARM_RELOC_HALF_SECTDIFF encodes the second section in the
// symbol/section pointer of the follow-on relocation.
if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
fmt << "-";
printRelocationTargetName(Obj, RENext, fmt);
}
fmt << ")";
break;
}
default: { printRelocationTargetName(Obj, RE, fmt); }
}
}
} else
printRelocationTargetName(Obj, RE, fmt);
fmt.flush();
Result.append(fmtbuf.begin(), fmtbuf.end());
return std::error_code();
}
static std::error_code getRelocationValueString(const RelocationRef &Rel,
SmallVectorImpl<char> &Result) {
const ObjectFile *Obj = Rel.getObject();
if (auto *ELF = dyn_cast<ELFObjectFileBase>(Obj))
return getRelocationValueString(ELF, Rel, Result);
if (auto *COFF = dyn_cast<COFFObjectFile>(Obj))
return getRelocationValueString(COFF, Rel, Result);
if (auto *Wasm = dyn_cast<WasmObjectFile>(Obj))
return getRelocationValueString(Wasm, Rel, Result);
if (auto *MachO = dyn_cast<MachOObjectFile>(Obj))
return getRelocationValueString(MachO, Rel, Result);
llvm_unreachable("unknown object file format");
}
/// @brief Indicates whether this relocation should be hidden when listing
/// relocations, usually because it is the trailing part of a multipart
/// relocation that will be printed as part of the leading relocation.
static bool getHidden(RelocationRef RelRef) {
const ObjectFile *Obj = RelRef.getObject();
auto *MachO = dyn_cast<MachOObjectFile>(Obj);
if (!MachO)
return false;
unsigned Arch = MachO->getArch();
DataRefImpl Rel = RelRef.getRawDataRefImpl();
uint64_t Type = MachO->getRelocationType(Rel);
// On arches that use the generic relocations, GENERIC_RELOC_PAIR
// is always hidden.
if (Arch == Triple::x86 || Arch == Triple::arm || Arch == Triple::ppc) {
if (Type == MachO::GENERIC_RELOC_PAIR)
return true;
} else if (Arch == Triple::x86_64) {
// On x86_64, X86_64_RELOC_UNSIGNED is hidden only when it follows
// an X86_64_RELOC_SUBTRACTOR.
if (Type == MachO::X86_64_RELOC_UNSIGNED && Rel.d.a > 0) {
DataRefImpl RelPrev = Rel;
RelPrev.d.a--;
uint64_t PrevType = MachO->getRelocationType(RelPrev);
if (PrevType == MachO::X86_64_RELOC_SUBTRACTOR)
return true;
}
}
return false;
}
static uint8_t getElfSymbolType(const ObjectFile *Obj, const SymbolRef &Sym) {
assert(Obj->isELF());
if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(Obj))
return Elf32LEObj->getSymbol(Sym.getRawDataRefImpl())->getType();
if (auto *Elf64LEObj = dyn_cast<ELF64LEObjectFile>(Obj))
return Elf64LEObj->getSymbol(Sym.getRawDataRefImpl())->getType();
if (auto *Elf32BEObj = dyn_cast<ELF32BEObjectFile>(Obj))
return Elf32BEObj->getSymbol(Sym.getRawDataRefImpl())->getType();
if (auto *Elf64BEObj = cast<ELF64BEObjectFile>(Obj))
return Elf64BEObj->getSymbol(Sym.getRawDataRefImpl())->getType();
llvm_unreachable("Unsupported binary format");
}
template <class ELFT> static void
addDynamicElfSymbols(const ELFObjectFile<ELFT> *Obj,
std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
for (auto Symbol : Obj->getDynamicSymbolIterators()) {
uint8_t SymbolType = Symbol.getELFType();
if (SymbolType != ELF::STT_FUNC || Symbol.getSize() == 0)
continue;
Expected<uint64_t> AddressOrErr = Symbol.getAddress();
if (!AddressOrErr)
report_error(Obj->getFileName(), AddressOrErr.takeError());
uint64_t Address = *AddressOrErr;
Expected<StringRef> Name = Symbol.getName();
if (!Name)
report_error(Obj->getFileName(), Name.takeError());
if (Name->empty())
continue;
Expected<section_iterator> SectionOrErr = Symbol.getSection();
if (!SectionOrErr)
report_error(Obj->getFileName(), SectionOrErr.takeError());
section_iterator SecI = *SectionOrErr;
if (SecI == Obj->section_end())
continue;
AllSymbols[*SecI].emplace_back(Address, *Name, SymbolType);
}
}
static void
addDynamicElfSymbols(const ObjectFile *Obj,
std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
assert(Obj->isELF());
if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(Obj))
addDynamicElfSymbols(Elf32LEObj, AllSymbols);
else if (auto *Elf64LEObj = dyn_cast<ELF64LEObjectFile>(Obj))
addDynamicElfSymbols(Elf64LEObj, AllSymbols);
else if (auto *Elf32BEObj = dyn_cast<ELF32BEObjectFile>(Obj))
addDynamicElfSymbols(Elf32BEObj, AllSymbols);
else if (auto *Elf64BEObj = cast<ELF64BEObjectFile>(Obj))
addDynamicElfSymbols(Elf64BEObj, AllSymbols);
else
llvm_unreachable("Unsupported binary format");
}
static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
if (StartAddress > StopAddress)
error("Start address should be less than stop address");
const Target *TheTarget = getTarget(Obj);
// Package up features to be passed to target/subtarget
SubtargetFeatures Features = Obj->getFeatures();
if (MAttrs.size()) {
for (unsigned i = 0; i != MAttrs.size(); ++i)
Features.AddFeature(MAttrs[i]);
}
std::unique_ptr<const MCRegisterInfo> MRI(
TheTarget->createMCRegInfo(TripleName));
if (!MRI)
report_error(Obj->getFileName(), "no register info for target " +
TripleName);
// Set up disassembler.
std::unique_ptr<const MCAsmInfo> AsmInfo(
TheTarget->createMCAsmInfo(*MRI, TripleName));
if (!AsmInfo)
report_error(Obj->getFileName(), "no assembly info for target " +
TripleName);
std::unique_ptr<const MCSubtargetInfo> STI(
TheTarget->createMCSubtargetInfo(TripleName, MCPU, Features.getString()));
if (!STI)
report_error(Obj->getFileName(), "no subtarget info for target " +
TripleName);
std::unique_ptr<const MCInstrInfo> MII(TheTarget->createMCInstrInfo());
if (!MII)
report_error(Obj->getFileName(), "no instruction info for target " +
TripleName);
MCObjectFileInfo MOFI;
MCContext Ctx(AsmInfo.get(), MRI.get(), &MOFI);
// FIXME: for now initialize MCObjectFileInfo with default values
MOFI.InitMCObjectFileInfo(Triple(TripleName), false, CodeModel::Default, Ctx);
std::unique_ptr<MCDisassembler> DisAsm(
TheTarget->createMCDisassembler(*STI, Ctx));
if (!DisAsm)
report_error(Obj->getFileName(), "no disassembler for target " +
TripleName);
std::unique_ptr<const MCInstrAnalysis> MIA(
TheTarget->createMCInstrAnalysis(MII.get()));
int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI));
if (!IP)
report_error(Obj->getFileName(), "no instruction printer for target " +
TripleName);
IP->setPrintImmHex(PrintImmHex);
PrettyPrinter &PIP = selectPrettyPrinter(Triple(TripleName));
StringRef Fmt = Obj->getBytesInAddress() > 4 ? "\t\t%016" PRIx64 ": " :
"\t\t\t%08" PRIx64 ": ";
SourcePrinter SP(Obj, TheTarget->getName());
// Create a mapping, RelocSecs = SectionRelocMap[S], where sections
// in RelocSecs contain the relocations for section S.
std::error_code EC;
std::map<SectionRef, SmallVector<SectionRef, 1>> SectionRelocMap;
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
section_iterator Sec2 = Section.getRelocatedSection();
if (Sec2 != Obj->section_end())
SectionRelocMap[*Sec2].push_back(Section);
}
// Create a mapping from virtual address to symbol name. This is used to
// pretty print the symbols while disassembling.
std::map<SectionRef, SectionSymbolsTy> AllSymbols;
for (const SymbolRef &Symbol : Obj->symbols()) {
Expected<uint64_t> AddressOrErr = Symbol.getAddress();
if (!AddressOrErr)
report_error(Obj->getFileName(), AddressOrErr.takeError());
uint64_t Address = *AddressOrErr;
Expected<StringRef> Name = Symbol.getName();
if (!Name)
report_error(Obj->getFileName(), Name.takeError());
if (Name->empty())
continue;
Expected<section_iterator> SectionOrErr = Symbol.getSection();
if (!SectionOrErr)
report_error(Obj->getFileName(), SectionOrErr.takeError());
section_iterator SecI = *SectionOrErr;
if (SecI == Obj->section_end())
continue;
uint8_t SymbolType = ELF::STT_NOTYPE;
if (Obj->isELF())
SymbolType = getElfSymbolType(Obj, Symbol);
AllSymbols[*SecI].emplace_back(Address, *Name, SymbolType);
}
if (AllSymbols.empty() && Obj->isELF())
addDynamicElfSymbols(Obj, AllSymbols);
// Create a mapping from virtual address to section.
std::vector<std::pair<uint64_t, SectionRef>> SectionAddresses;
for (SectionRef Sec : Obj->sections())
SectionAddresses.emplace_back(Sec.getAddress(), Sec);
array_pod_sort(SectionAddresses.begin(), SectionAddresses.end());
// Linked executables (.exe and .dll files) typically don't include a real
// symbol table but they might contain an export table.
if (const auto *COFFObj = dyn_cast<COFFObjectFile>(Obj)) {
for (const auto &ExportEntry : COFFObj->export_directories()) {
StringRef Name;
error(ExportEntry.getSymbolName(Name));
if (Name.empty())
continue;
uint32_t RVA;
error(ExportEntry.getExportRVA(RVA));
uint64_t VA = COFFObj->getImageBase() + RVA;
auto Sec = std::upper_bound(
SectionAddresses.begin(), SectionAddresses.end(), VA,
[](uint64_t LHS, const std::pair<uint64_t, SectionRef> &RHS) {
return LHS < RHS.first;
});
if (Sec != SectionAddresses.begin())
--Sec;
else
Sec = SectionAddresses.end();
if (Sec != SectionAddresses.end())
AllSymbols[Sec->second].emplace_back(VA, Name, ELF::STT_NOTYPE);
}
}
// Sort all the symbols so that a simple binary search can find
// a symbol near an address.
for (std::pair<const SectionRef, SectionSymbolsTy> &SecSyms : AllSymbols)
array_pod_sort(SecSyms.second.begin(), SecSyms.second.end());
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
if (!DisassembleAll && (!Section.isText() || Section.isVirtual()))
continue;
uint64_t SectionAddr = Section.getAddress();
uint64_t SectSize = Section.getSize();
if (!SectSize)
continue;
// Get the list of all the symbols in this section.
SectionSymbolsTy &Symbols = AllSymbols[Section];
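// On ARM and AArch64, ELF mapping symbols mark where data ("$d") and
// code ("$x", "$a", "$t") start inside a section; their section-relative
// offsets are collected here so the loop below can switch between
// dumping data and disassembling instructions.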
std::vector<uint64_t> DataMappingSymsAddr;
std::vector<uint64_t> TextMappingSymsAddr;
if (isArmElf(Obj)) {
for (const auto &Symb : Symbols) {
uint64_t Address = std::get<0>(Symb);
StringRef Name = std::get<1>(Symb);
if (Name.startswith("$d"))
DataMappingSymsAddr.push_back(Address - SectionAddr);
if (Name.startswith("$x"))
TextMappingSymsAddr.push_back(Address - SectionAddr);
if (Name.startswith("$a"))
TextMappingSymsAddr.push_back(Address - SectionAddr);
if (Name.startswith("$t"))
TextMappingSymsAddr.push_back(Address - SectionAddr);
}
}
std::sort(DataMappingSymsAddr.begin(), DataMappingSymsAddr.end());
std::sort(TextMappingSymsAddr.begin(), TextMappingSymsAddr.end());
if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
// AMDGPU disassembler uses symbolizer for printing labels
std::unique_ptr<MCRelocationInfo> RelInfo(
TheTarget->createMCRelocationInfo(TripleName, Ctx));
if (RelInfo) {
std::unique_ptr<MCSymbolizer> Symbolizer(
TheTarget->createMCSymbolizer(
TripleName, nullptr, nullptr, &Symbols, &Ctx, std::move(RelInfo)));
DisAsm->setSymbolizer(std::move(Symbolizer));
}
}
// Make a list of all the relocations for this section.
std::vector<RelocationRef> Rels;
if (InlineRelocs) {
for (const SectionRef &RelocSec : SectionRelocMap[Section]) {
for (const RelocationRef &Reloc : RelocSec.relocations()) {
Rels.push_back(Reloc);
}
}
}
// Sort relocations by address.
std::sort(Rels.begin(), Rels.end(), RelocAddressLess);
StringRef SegmentName = "";
if (const MachOObjectFile *MachO = dyn_cast<const MachOObjectFile>(Obj)) {
DataRefImpl DR = Section.getRawDataRefImpl();
SegmentName = MachO->getSectionFinalSegmentName(DR);
}
StringRef name;
error(Section.getName(name));
if ((SectionAddr <= StopAddress) &&
(SectionAddr + SectSize) >= StartAddress) {
outs() << "Disassembly of section ";
if (!SegmentName.empty())
outs() << SegmentName << ",";
outs() << name << ':';
}
// If the section has no symbol at the start, just insert a dummy one.
if (Symbols.empty() || std::get<0>(Symbols[0]) != 0) {
Symbols.insert(Symbols.begin(),
std::make_tuple(SectionAddr, name, Section.isText()
? ELF::STT_FUNC
: ELF::STT_OBJECT));
}
SmallString<40> Comments;
raw_svector_ostream CommentStream(Comments);
StringRef BytesStr;
error(Section.getContents(BytesStr));
ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(BytesStr.data()),
BytesStr.size());
uint64_t Size;
uint64_t Index;
std::vector<RelocationRef>::const_iterator rel_cur = Rels.begin();
std::vector<RelocationRef>::const_iterator rel_end = Rels.end();
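// rel_cur walks the sorted relocation list in step with the disassembly
// below, so each relocation is printed right after the instruction (or
// data) it applies to.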
// Disassemble symbol by symbol.
for (unsigned si = 0, se = Symbols.size(); si != se; ++si) {
uint64_t Start = std::get<0>(Symbols[si]) - SectionAddr;
// The end is either the section end or the beginning of the next
// symbol.
uint64_t End =
(si == se - 1) ? SectSize : std::get<0>(Symbols[si + 1]) - SectionAddr;
// Don't try to disassemble beyond the end of section contents.
if (End > SectSize)
End = SectSize;
// If this symbol has the same address as the next symbol, then skip it.
if (Start >= End)
continue;
// Skip this symbol if its data is not between StartAddress and StopAddress.
if (End + SectionAddr < StartAddress ||
Start + SectionAddr > StopAddress) {
continue;
}
// Stop disassembly at the stop address specified
if (End + SectionAddr > StopAddress)
End = StopAddress - SectionAddr;
if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
// Round the symbol size down to a multiple of 4 bytes.
End = Start + ((End - Start) & ~0x3ull);
if (std::get<2>(Symbols[si]) == ELF::STT_AMDGPU_HSA_KERNEL) {
// Skip the amd_kernel_code_t header at the beginning of a kernel symbol (256 bytes).
Start += 256;
}
if (si == se - 1 ||
std::get<2>(Symbols[si + 1]) == ELF::STT_AMDGPU_HSA_KERNEL) {
// Trim trailing zeroes at the end of the kernel, cutting at most 256 bytes.
const uint64_t EndAlign = 256;
const auto Limit = End - (std::min)(EndAlign, End - Start);
while (End > Limit &&
*reinterpret_cast<const support::ulittle32_t*>(&Bytes[End - 4]) == 0)
End -= 4;
}
}
outs() << '\n' << std::get<1>(Symbols[si]) << ":\n";
#ifndef NDEBUG
raw_ostream &DebugOut = DebugFlag ? dbgs() : nulls();
#else
raw_ostream &DebugOut = nulls();
#endif
for (Index = Start; Index < End; Index += Size) {
MCInst Inst;
if (Index + SectionAddr < StartAddress ||
Index + SectionAddr > StopAddress) {
// Skip byte by byte until StartAddress is reached.
Size = 1;
continue;
}
// ARM and AArch64 ELF binaries can interleave data and text in the
// same section. We rely on the mapping symbols collected above to
// understand what we need to dump: if a data marker falls within a
// function, the bytes are printed as .word/.short/.byte directives
// rather than disassembled.
if (isArmElf(Obj) && std::get<2>(Symbols[si]) != ELF::STT_OBJECT &&
!DisassembleAll) {
uint64_t Stride = 0;
auto DAI = std::lower_bound(DataMappingSymsAddr.begin(),
DataMappingSymsAddr.end(), Index);
if (DAI != DataMappingSymsAddr.end() && *DAI == Index) {
// Switch to data.
while (Index < End) {
outs() << format("%8" PRIx64 ":", SectionAddr + Index);
outs() << "\t";
if (Index + 4 <= End) {
Stride = 4;
dumpBytes(Bytes.slice(Index, 4), outs());
outs() << "\t.word\t";
uint32_t Data = 0;
if (Obj->isLittleEndian()) {
const auto Word =
reinterpret_cast<const support::ulittle32_t *>(
Bytes.data() + Index);
Data = *Word;
} else {
const auto Word = reinterpret_cast<const support::ubig32_t *>(
Bytes.data() + Index);
Data = *Word;
}
outs() << "0x" << format("%08" PRIx32, Data);
} else if (Index + 2 <= End) {
Stride = 2;
dumpBytes(Bytes.slice(Index, 2), outs());
outs() << "\t\t.short\t";
uint16_t Data = 0;
if (Obj->isLittleEndian()) {
const auto Short =
reinterpret_cast<const support::ulittle16_t *>(
Bytes.data() + Index);
Data = *Short;
} else {
const auto Short =
reinterpret_cast<const support::ubig16_t *>(Bytes.data() +
Index);
Data = *Short;
}
outs() << "0x" << format("%04" PRIx16, Data);
} else {
Stride = 1;
dumpBytes(Bytes.slice(Index, 1), outs());
outs() << "\t\t.byte\t";
outs() << "0x" << format("%02" PRIx8, Bytes.slice(Index, 1)[0]);
}
Index += Stride;
outs() << "\n";
auto TAI = std::lower_bound(TextMappingSymsAddr.begin(),
TextMappingSymsAddr.end(), Index);
if (TAI != TextMappingSymsAddr.end() && *TAI == Index)
break;
}
}
}
// If there is a data symbol inside an ELF text section and we are only
// disassembling text (applicable to all architectures), we must print
// the data instead of disassembling it.
if (Obj->isELF() && std::get<2>(Symbols[si]) == ELF::STT_OBJECT &&
!DisassembleAll && Section.isText()) {
// Print the data in hex and ASCII, up to 8 bytes per line.
uint8_t AsciiData[9] = {'\0'};
uint8_t Byte;
int NumBytes = 0;
for (Index = Start; Index < End; Index += 1) {
if (((SectionAddr + Index) < StartAddress) ||
((SectionAddr + Index) > StopAddress))
continue;
if (NumBytes == 0) {
outs() << format("%8" PRIx64 ":", SectionAddr + Index);
outs() << "\t";
}
Byte = Bytes.slice(Index)[0];
outs() << format(" %02x", Byte);
AsciiData[NumBytes] = isprint(Byte) ? Byte : '.';
uint8_t IndentOffset = 0;
NumBytes++;
if (Index == End - 1 || NumBytes > 8) {
// Pad the line when fewer than 8 bytes were printed: each byte uses
// two hex digits plus one separating space.
IndentOffset = 3 * (8 - NumBytes);
for (int Excess = 8 - NumBytes; Excess < 8; Excess++)
AsciiData[Excess] = '\0';
NumBytes = 8;
}
if (NumBytes == 8) {
AsciiData[8] = '\0';
outs() << std::string(IndentOffset, ' ') << " ";
outs() << reinterpret_cast<char *>(AsciiData);
outs() << '\n';
NumBytes = 0;
}
}
}
if (Index >= End)
break;
// Disassemble a real instruction or, when disassembling all sections
// is requested, raw data.
bool Disassembled = DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
SectionAddr + Index, DebugOut,
CommentStream);
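// If the disassembler failed it may report a size of zero; advance by
// at least one byte so the loop always makes progress.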
if (Size == 0)
Size = 1;
PIP.printInst(*IP, Disassembled ? &Inst : nullptr,
Bytes.slice(Index, Size), SectionAddr + Index, outs(), "",
*STI, &SP);
outs() << CommentStream.str();
Comments.clear();
// Try to resolve the target of a call, tail call, etc. to a specific
// symbol.
if (MIA && (MIA->isCall(Inst) || MIA->isUnconditionalBranch(Inst) ||
MIA->isConditionalBranch(Inst))) {
uint64_t Target;
if (MIA->evaluateBranch(Inst, SectionAddr + Index, Size, Target)) {
// In a relocatable object, the target's section must reside in
// the same section as the call instruction or it is accessed
// through a relocation.
//
// In a non-relocatable object, the target may be in any section.
//
// N.B. We don't walk the relocations in the relocatable case yet.
auto *TargetSectionSymbols = &Symbols;
if (!Obj->isRelocatableObject()) {
auto SectionAddress = std::upper_bound(
SectionAddresses.begin(), SectionAddresses.end(), Target,
[](uint64_t LHS,
const std::pair<uint64_t, SectionRef> &RHS) {
return LHS < RHS.first;
});
if (SectionAddress != SectionAddresses.begin()) {
--SectionAddress;
TargetSectionSymbols = &AllSymbols[SectionAddress->second];
} else {
TargetSectionSymbols = nullptr;
}
}
// Find the first symbol in the section whose offset is less than
// or equal to the target.
if (TargetSectionSymbols) {
auto TargetSym = std::upper_bound(
TargetSectionSymbols->begin(), TargetSectionSymbols->end(),
Target, [](uint64_t LHS,
const std::tuple<uint64_t, StringRef, uint8_t> &RHS) {
return LHS < std::get<0>(RHS);
});
if (TargetSym != TargetSectionSymbols->begin()) {
--TargetSym;
uint64_t TargetAddress = std::get<0>(*TargetSym);
StringRef TargetName = std::get<1>(*TargetSym);
outs() << " <" << TargetName;
uint64_t Disp = Target - TargetAddress;
if (Disp)
outs() << "+0x" << utohexstr(Disp);
outs() << '>';
}
}
}
}
outs() << "\n";
// Print relocation for instruction.
while (rel_cur != rel_end) {
bool hidden = getHidden(*rel_cur);
uint64_t addr = rel_cur->getOffset();
SmallString<16> name;
SmallString<32> val;
// If this relocation is hidden, skip it.
if (hidden || ((SectionAddr + addr) < StartAddress)) {
++rel_cur;
continue;
}
// Stop when rel_cur's address is past the current instruction.
if (addr >= Index + Size) break;
rel_cur->getTypeName(name);
error(getRelocationValueString(*rel_cur, val));
outs() << format(Fmt.data(), SectionAddr + addr) << name
<< "\t" << val << "\n";
++rel_cur;
}
}
}
}
}
void llvm::PrintRelocations(const ObjectFile *Obj) {
StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 :
"%08" PRIx64;
// Regular objdump doesn't print relocations in non-relocatable object
// files.
if (!Obj->isRelocatableObject())
return;
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
if (Section.relocation_begin() == Section.relocation_end())
continue;
StringRef secname;
error(Section.getName(secname));
outs() << "RELOCATION RECORDS FOR [" << secname << "]:\n";
for (const RelocationRef &Reloc : Section.relocations()) {
bool hidden = getHidden(Reloc);
uint64_t address = Reloc.getOffset();
SmallString<32> relocname;
SmallString<32> valuestr;
if (address < StartAddress || address > StopAddress || hidden)
continue;
Reloc.getTypeName(relocname);
error(getRelocationValueString(Reloc, valuestr));
outs() << format(Fmt.data(), address) << " " << relocname << " "
<< valuestr << "\n";
}
outs() << "\n";
}
}
void llvm::PrintSectionHeaders(const ObjectFile *Obj) {
outs() << "Sections:\n"
"Idx Name Size Address Type\n";
unsigned i = 0;
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
StringRef Name;
error(Section.getName(Name));
uint64_t Address = Section.getAddress();
uint64_t Size = Section.getSize();
bool Text = Section.isText();
bool Data = Section.isData();
bool BSS = Section.isBSS();
std::string Type = (std::string(Text ? "TEXT " : "") +
(Data ? "DATA " : "") + (BSS ? "BSS" : ""));
outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", i,
Name.str().c_str(), Size, Address, Type.c_str());
++i;
}
}
void llvm::PrintSectionContents(const ObjectFile *Obj) {
std::error_code EC;
for (const SectionRef &Section : ToolSectionFilter(*Obj)) {
StringRef Name;
StringRef Contents;
error(Section.getName(Name));
uint64_t BaseAddr = Section.getAddress();
uint64_t Size = Section.getSize();
if (!Size)
continue;
outs() << "Contents of section " << Name << ":\n";
if (Section.isBSS()) {
outs() << format("<skipping contents of bss section at [%04" PRIx64
", %04" PRIx64 ")>\n",
BaseAddr, BaseAddr + Size);
continue;
}
error(Section.getContents(Contents));
// Dump out the content as hex and printable ascii characters.
for (std::size_t addr = 0, end = Contents.size(); addr < end; addr += 16) {
outs() << format(" %04" PRIx64 " ", BaseAddr + addr);
// Dump line of hex.
for (std::size_t i = 0; i < 16; ++i) {
if (i != 0 && i % 4 == 0)
outs() << ' ';
if (addr + i < end)
outs() << hexdigit((Contents[addr + i] >> 4) & 0xF, true)
<< hexdigit(Contents[addr + i] & 0xF, true);
else
outs() << " ";
}
// Print ascii.
outs() << " ";
for (std::size_t i = 0; i < 16 && addr + i < end; ++i) {
if (std::isprint(static_cast<unsigned char>(Contents[addr + i]) & 0xFF))
outs() << Contents[addr + i];
else
outs() << ".";
}
outs() << "\n";
}
}
}
void llvm::PrintSymbolTable(const ObjectFile *o, StringRef ArchiveName,
StringRef ArchitectureName) {
outs() << "SYMBOL TABLE:\n";
if (const COFFObjectFile *coff = dyn_cast<const COFFObjectFile>(o)) {
printCOFFSymbolTable(coff);
return;
}
for (const SymbolRef &Symbol : o->symbols()) {
Expected<uint64_t> AddressOrError = Symbol.getAddress();
if (!AddressOrError)
report_error(ArchiveName, o->getFileName(), AddressOrError.takeError(),
ArchitectureName);
uint64_t Address = *AddressOrError;
if ((Address < StartAddress) || (Address > StopAddress))
continue;
Expected<SymbolRef::Type> TypeOrError = Symbol.getType();
if (!TypeOrError)
report_error(ArchiveName, o->getFileName(), TypeOrError.takeError(),
ArchitectureName);
SymbolRef::Type Type = *TypeOrError;
uint32_t Flags = Symbol.getFlags();
Expected<section_iterator> SectionOrErr = Symbol.getSection();
if (!SectionOrErr)
report_error(ArchiveName, o->getFileName(), SectionOrErr.takeError(),
ArchitectureName);
section_iterator Section = *SectionOrErr;
StringRef Name;
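// For debug symbols, print the name of the section they refer to
// instead of the symbol's own name.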
if (Type == SymbolRef::ST_Debug && Section != o->section_end()) {
Section->getName(Name);
} else {
Expected<StringRef> NameOrErr = Symbol.getName();
if (!NameOrErr)
report_error(ArchiveName, o->getFileName(), NameOrErr.takeError(),
ArchitectureName);
Name = *NameOrErr;
}
bool Global = Flags & SymbolRef::SF_Global;
bool Weak = Flags & SymbolRef::SF_Weak;
bool Absolute = Flags & SymbolRef::SF_Absolute;
bool Common = Flags & SymbolRef::SF_Common;
bool Hidden = Flags & SymbolRef::SF_Hidden;
char GlobLoc = ' ';
if (Type != SymbolRef::ST_Unknown)
GlobLoc = Global ? 'g' : 'l';
char Debug = (Type == SymbolRef::ST_Debug || Type == SymbolRef::ST_File)
? 'd' : ' ';
char FileFunc = ' ';
if (Type == SymbolRef::ST_File)
FileFunc = 'f';
else if (Type == SymbolRef::ST_Function)
FileFunc = 'F';
const char *Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 :
"%08" PRIx64;
outs() << format(Fmt, Address) << " "
<< GlobLoc // Local -> 'l', Global -> 'g', Neither -> ' '
<< (Weak ? 'w' : ' ') // Weak?
<< ' ' // Constructor. Not supported yet.
<< ' ' // Warning. Not supported yet.
<< ' ' // Indirect reference to another symbol.
<< Debug // Debugging (d) or dynamic (D) symbol.
<< FileFunc // Name of function (F), file (f) or object (O).
<< ' ';
if (Absolute) {
outs() << "*ABS*";
} else if (Common) {
outs() << "*COM*";
} else if (Section == o->section_end()) {
outs() << "*UND*";
} else {
if (const MachOObjectFile *MachO =
dyn_cast<const MachOObjectFile>(o)) {
DataRefImpl DR = Section->getRawDataRefImpl();
StringRef SegmentName = MachO->getSectionFinalSegmentName(DR);
outs() << SegmentName << ",";
}
StringRef SectionName;
error(Section->getName(SectionName));
outs() << SectionName;
}
outs() << '\t';
if (Common || isa<ELFObjectFileBase>(o)) {
uint64_t Val =
Common ? Symbol.getAlignment() : ELFSymbolRef(Symbol).getSize();
outs() << format("\t %08" PRIx64 " ", Val);
}
if (Hidden) {
outs() << ".hidden ";
}
outs() << Name
<< '\n';
}
}
static void PrintUnwindInfo(const ObjectFile *o) {
outs() << "Unwind info:\n\n";
if (const COFFObjectFile *coff = dyn_cast<COFFObjectFile>(o)) {
printCOFFUnwindInfo(coff);
} else if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachOUnwindInfo(MachO);
else {
// TODO: Extract DWARF dump tool to objdump.
errs() << "This operation is only currently supported "
"for COFF and MachO object files.\n";
return;
}
}
void llvm::printExportsTrie(const ObjectFile *o) {
outs() << "Exports trie:\n";
if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachOExportsTrie(MachO);
else {
errs() << "This operation is only currently supported "
"for Mach-O executable files.\n";
return;
}
}
void llvm::printRebaseTable(ObjectFile *o) {
outs() << "Rebase table:\n";
if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachORebaseTable(MachO);
else {
errs() << "This operation is only currently supported "
"for Mach-O executable files.\n";
return;
}
}
void llvm::printBindTable(ObjectFile *o) {
outs() << "Bind table:\n";
if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachOBindTable(MachO);
else {
errs() << "This operation is only currently supported "
"for Mach-O executable files.\n";
return;
}
}
void llvm::printLazyBindTable(ObjectFile *o) {
outs() << "Lazy bind table:\n";
if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachOLazyBindTable(MachO);
else {
errs() << "This operation is only currently supported "
"for Mach-O executable files.\n";
return;
}
}
void llvm::printWeakBindTable(ObjectFile *o) {
outs() << "Weak bind table:\n";
if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
printMachOWeakBindTable(MachO);
else {
errs() << "This operation is only currently supported "
"for Mach-O executable files.\n";
return;
}
}
/// Dump the raw contents of the __clangast section so the output can be piped
/// into llvm-bcanalyzer.
void llvm::printRawClangAST(const ObjectFile *Obj) {
if (outs().is_displayed()) {
errs() << "The -raw-clang-ast option will dump the raw binary contents of "
"the clang ast section.\n"
"Please redirect the output to a file or another program such as "
"llvm-bcanalyzer.\n";
return;
}
StringRef ClangASTSectionName("__clangast");
if (isa<COFFObjectFile>(Obj)) {
ClangASTSectionName = "clangast";
}
Optional<object::SectionRef> ClangASTSection;
for (auto Sec : ToolSectionFilter(*Obj)) {
StringRef Name;
Sec.getName(Name);
if (Name == ClangASTSectionName) {
ClangASTSection = Sec;
break;
}
}
if (!ClangASTSection)
return;
StringRef ClangASTContents;
error(ClangASTSection.getValue().getContents(ClangASTContents));
outs().write(ClangASTContents.data(), ClangASTContents.size());
}
static void printFaultMaps(const ObjectFile *Obj) {
const char *FaultMapSectionName = nullptr;
if (isa<ELFObjectFileBase>(Obj)) {
FaultMapSectionName = ".llvm_faultmaps";
} else if (isa<MachOObjectFile>(Obj)) {
FaultMapSectionName = "__llvm_faultmaps";
} else {
errs() << "This operation is only currently supported "
"for ELF and Mach-O executable files.\n";
return;
}
Optional<object::SectionRef> FaultMapSection;
for (auto Sec : ToolSectionFilter(*Obj)) {
StringRef Name;
Sec.getName(Name);
if (Name == FaultMapSectionName) {
FaultMapSection = Sec;
break;
}
}
outs() << "FaultMap table:\n";
if (!FaultMapSection.hasValue()) {
outs() << "<not found>\n";
return;
}
StringRef FaultMapContents;
error(FaultMapSection.getValue().getContents(FaultMapContents));
FaultMapParser FMP(FaultMapContents.bytes_begin(),
FaultMapContents.bytes_end());
outs() << FMP;
}
static void printPrivateFileHeaders(const ObjectFile *o, bool onlyFirst) {
if (o->isELF())
return printELFFileHeader(o);
if (o->isCOFF())
return printCOFFFileHeader(o);
if (o->isWasm())
return printWasmFileHeader(o);
if (o->isMachO()) {
printMachOFileHeader(o);
if (!onlyFirst)
printMachOLoadCommands(o);
return;
}
report_error(o->getFileName(), "Invalid/Unsupported object file format");
}
static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
StringRef ArchiveName = a != nullptr ? a->getFileName() : "";
// Avoid other output when using a raw option.
if (!RawClangAST) {
outs() << '\n';
if (a)
outs() << a->getFileName() << "(" << o->getFileName() << ")";
else
outs() << o->getFileName();
outs() << ":\tfile format " << o->getFileFormatName() << "\n\n";
}
if (Disassemble)
DisassembleObject(o, Relocations);
if (Relocations && !Disassemble)
PrintRelocations(o);
if (SectionHeaders)
PrintSectionHeaders(o);
if (SectionContents)
PrintSectionContents(o);
if (SymbolTable)
PrintSymbolTable(o, ArchiveName);
if (UnwindInfo)
PrintUnwindInfo(o);
if (PrivateHeaders || FirstPrivateHeader)
printPrivateFileHeaders(o, FirstPrivateHeader);
if (ExportsTrie)
printExportsTrie(o);
if (Rebase)
printRebaseTable(o);
if (Bind)
printBindTable(o);
if (LazyBind)
printLazyBindTable(o);
if (WeakBind)
printWeakBindTable(o);
if (RawClangAST)
printRawClangAST(o);
if (PrintFaultMaps)
printFaultMaps(o);
if (DwarfDumpType != DIDT_Null) {
std::unique_ptr<DIContext> DICtx(new DWARFContextInMemory(*o));
// Dump the complete DWARF structure.
DIDumpOptions DumpOpts;
DumpOpts.DumpType = DwarfDumpType;
DumpOpts.DumpEH = true;
DICtx->dump(outs(), DumpOpts);
}
}
static void DumpObject(const COFFImportFile *I, const Archive *A) {
StringRef ArchiveName = A ? A->getFileName() : "";
// Avoid other output when using a raw option.
if (!RawClangAST)
outs() << '\n'
<< ArchiveName << "(" << I->getFileName() << ")"
<< ":\tfile format COFF-import-file"
<< "\n\n";
if (SymbolTable)
printCOFFSymbolTable(I);
}
/// @brief Dump each object file in \a a.
static void DumpArchive(const Archive *a) {
Error Err = Error::success();
for (auto &C : a->children(Err)) {
Expected<std::unique_ptr<Binary>> ChildOrErr = C.getAsBinary();
if (!ChildOrErr) {
if (auto E = isNotObjectErrorInvalidFileType(ChildOrErr.takeError()))
report_error(a->getFileName(), C, std::move(E));
continue;
}
if (ObjectFile *o = dyn_cast<ObjectFile>(&*ChildOrErr.get()))
DumpObject(o, a);
else if (COFFImportFile *I = dyn_cast<COFFImportFile>(&*ChildOrErr.get()))
DumpObject(I, a);
else
report_error(a->getFileName(), object_error::invalid_file_type);
}
if (Err)
report_error(a->getFileName(), std::move(Err));
}
/// @brief Open file and figure out how to dump it.
static void DumpInput(StringRef file) {
// If we are using the Mach-O specific object file parser, let it parse
// the file and process the command line options, so that the -arch
// flags can be used to select specific slices, etc.
if (MachOOpt) {
ParseInputMachO(file);
return;
}
// Attempt to open the binary.
Expected<OwningBinary<Binary>> BinaryOrErr = createBinary(file);
if (!BinaryOrErr)
report_error(file, BinaryOrErr.takeError());
Binary &Binary = *BinaryOrErr.get().getBinary();
if (Archive *a = dyn_cast<Archive>(&Binary))
DumpArchive(a);
else if (ObjectFile *o = dyn_cast<ObjectFile>(&Binary))
DumpObject(o);
else
report_error(file, object_error::invalid_file_type);
}
int main(int argc, char **argv) {
// Print a stack trace if we signal out.
sys::PrintStackTraceOnErrorSignal(argv[0]);
PrettyStackTraceProgram X(argc, argv);
llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
// Initialize targets and assembly printers/parsers.
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllDisassemblers();
// Register the target printer for --version.
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
cl::ParseCommandLineOptions(argc, argv, "llvm object file dumper\n");
TripleName = Triple::normalize(TripleName);
ToolName = argv[0];
// Default to a.out if no filenames are specified.
if (InputFilenames.size() == 0)
InputFilenames.push_back("a.out");
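// These options are only meaningful when disassembling, so turn
// disassembly on implicitly.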
if (DisassembleAll || PrintSource || PrintLines)
Disassemble = true;
if (!Disassemble
&& !Relocations
&& !SectionHeaders
&& !SectionContents
&& !SymbolTable
&& !UnwindInfo
&& !PrivateHeaders
&& !FirstPrivateHeader
&& !ExportsTrie
&& !Rebase
&& !Bind
&& !LazyBind
&& !WeakBind
&& !RawClangAST
&& !(UniversalHeaders && MachOOpt)
&& !(ArchiveHeaders && MachOOpt)
&& !(IndirectSymbols && MachOOpt)
&& !(DataInCode && MachOOpt)
&& !(LinkOptHints && MachOOpt)
&& !(InfoPlist && MachOOpt)
&& !(DylibsUsed && MachOOpt)
&& !(DylibId && MachOOpt)
&& !(ObjcMetaData && MachOOpt)
&& !(FilterSections.size() != 0 && MachOOpt)
&& !PrintFaultMaps
&& DwarfDumpType == DIDT_Null) {
cl::PrintHelpMessage();
return 2;
}
std::for_each(InputFilenames.begin(), InputFilenames.end(),
DumpInput);
return EXIT_SUCCESS;
}
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index 2ef0a8f77ec9..3351ebed54bd 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -1,159 +1,159 @@
from __future__ import absolute_import
import inspect
import os
import sys
import lit.Test
import lit.formats
import lit.TestingConfig
import lit.util
# LitConfig must be a new style class for properties to work
class LitConfig(object):
"""LitConfig - Configuration data for a 'lit' test runner instance, shared
across all tests.
The LitConfig object is also used to communicate with client configuration
files; it is always passed in as the global variable 'lit' so that
configuration files can access common functionality and internal components
easily.
"""
def __init__(self, progname, path, quiet,
useValgrind, valgrindLeakCheck, valgrindArgs,
noExecute, debug, isWindows,
params, config_prefix = None,
maxIndividualTestTime = 0,
maxFailures = None,
- parallelism_groups = [],
+ parallelism_groups = {},
echo_all_commands = False):
# The name of the test runner.
self.progname = progname
# The items to add to the PATH environment variable.
self.path = [str(p) for p in path]
self.quiet = bool(quiet)
self.useValgrind = bool(useValgrind)
self.valgrindLeakCheck = bool(valgrindLeakCheck)
self.valgrindUserArgs = list(valgrindArgs)
self.noExecute = noExecute
self.debug = debug
self.isWindows = bool(isWindows)
self.params = dict(params)
self.bashPath = None
# Configuration files to look for when discovering test suites.
self.config_prefix = config_prefix or 'lit'
self.config_name = '%s.cfg' % (self.config_prefix,)
self.site_config_name = '%s.site.cfg' % (self.config_prefix,)
self.local_config_name = '%s.local.cfg' % (self.config_prefix,)
self.numErrors = 0
self.numWarnings = 0
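# Build the valgrind command prefix up front; the test runner prepends
# it to each test invocation when valgrind is enabled.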
self.valgrindArgs = []
if self.useValgrind:
self.valgrindArgs = ['valgrind', '-q', '--run-libc-freeres=no',
'--tool=memcheck', '--trace-children=yes',
'--error-exitcode=123']
if self.valgrindLeakCheck:
self.valgrindArgs.append('--leak-check=full')
else:
# The default is 'summary'.
self.valgrindArgs.append('--leak-check=no')
self.valgrindArgs.extend(self.valgrindUserArgs)
self.maxIndividualTestTime = maxIndividualTestTime
self.maxFailures = maxFailures
self.parallelism_groups = parallelism_groups
self.echo_all_commands = echo_all_commands
@property
def maxIndividualTestTime(self):
"""
Interface for getting maximum time to spend executing
a single test
"""
return self._maxIndividualTestTime
@maxIndividualTestTime.setter
def maxIndividualTestTime(self, value):
"""
Interface for setting maximum time to spend executing
a single test
"""
self._maxIndividualTestTime = value
if self.maxIndividualTestTime > 0:
# The current implementation needs psutil to set
# a timeout per test. Check it's available.
# See lit.util.killProcessAndChildren()
try:
import psutil # noqa: F401
except ImportError:
self.fatal("Setting a timeout per test requires the"
" Python psutil module but it could not be"
" found. Try installing it via pip or via"
" your operating system's package manager.")
elif self.maxIndividualTestTime < 0:
self.fatal('The timeout per test must be >= 0 seconds')
def load_config(self, config, path):
"""load_config(config, path) - Load a config object from an alternate
path."""
if self.debug:
self.note('load_config from %r' % path)
config.load_from_path(path, self)
return config
def getBashPath(self):
"""getBashPath - Get the path to 'bash'"""
if self.bashPath is not None:
return self.bashPath
self.bashPath = lit.util.which('bash', os.pathsep.join(self.path))
if self.bashPath is None:
self.bashPath = lit.util.which('bash')
if self.bashPath is None:
self.bashPath = ''
return self.bashPath
def getToolsPath(self, dir, paths, tools):
if dir is not None and os.path.isabs(dir) and os.path.isdir(dir):
if not lit.util.checkToolsPath(dir, tools):
return None
else:
dir = lit.util.whichTools(tools, paths)
# bash
self.bashPath = lit.util.which('bash', dir)
if self.bashPath is None:
self.bashPath = ''
return dir
def _write_message(self, kind, message):
# Get the file/line where this message was generated.
f = inspect.currentframe()
# Step out of _write_message, and then out of wrapper.
f = f.f_back.f_back
file,line,_,_,_ = inspect.getframeinfo(f)
location = '%s:%d' % (file, line)
sys.stderr.write('%s: %s: %s: %s\n' % (self.progname, location,
kind, message))
def note(self, message):
self._write_message('note', message)
def warning(self, message):
self._write_message('warning', message)
self.numWarnings += 1
def error(self, message):
self._write_message('error', message)
self.numErrors += 1
def fatal(self, message):
self._write_message('fatal', message)
sys.exit(2)
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 46bcac4b306e..a60a0f854870 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -1,1188 +1,1188 @@
from __future__ import absolute_import
import os, signal, subprocess, sys
import re
import platform
import tempfile
import threading
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from lit.ShCommands import GlobItem
import lit.ShUtil as ShUtil
import lit.Test as Test
import lit.util
from lit.util import to_bytes, to_string
from lit.BooleanExpression import BooleanExpression
class InternalShellError(Exception):
def __init__(self, command, message):
self.command = command
self.message = message
kIsWindows = platform.system() == 'Windows'
# Don't use close_fds on Windows.
kUseCloseFDs = not kIsWindows
# Use temporary files to replace /dev/null on Windows.
kAvoidDevNull = kIsWindows
class ShellEnvironment(object):
"""Mutable shell environment containing things like CWD and env vars.
Environment variables are not implemented, but cwd tracking is.
"""
def __init__(self, cwd, env):
self.cwd = cwd
self.env = dict(env)
class TimeoutHelper(object):
"""
Object used to help manage enforcing a timeout in
_executeShCmd(). It is passed through recursive calls
to collect processes that have been executed so that when
the timeout happens they can be killed.
"""
def __init__(self, timeout):
self.timeout = timeout
self._procs = []
self._timeoutReached = False
self._doneKillPass = False
# This lock will be used to protect concurrent access
# to _procs and _doneKillPass
self._lock = None
self._timer = None
def cancel(self):
if not self.active():
return
self._timer.cancel()
def active(self):
return self.timeout > 0
def addProcess(self, proc):
if not self.active():
return
needToRunKill = False
with self._lock:
self._procs.append(proc)
# Avoid re-entering the lock by finding out if kill needs to be run
# again here but call it if necessary once we have left the lock.
# We could use a reentrant lock here instead but this code seems
# clearer to me.
needToRunKill = self._doneKillPass
# The initial call to _kill() from the timer thread already happened so
# we need to call it again from this thread; otherwise this process
# will be left to run even though the timeout was already hit.
if needToRunKill:
assert self.timeoutReached()
self._kill()
def startTimer(self):
if not self.active():
return
# Do some late initialisation that's only needed
# if there is a timeout set
self._lock = threading.Lock()
self._timer = threading.Timer(self.timeout, self._handleTimeoutReached)
self._timer.start()
def _handleTimeoutReached(self):
self._timeoutReached = True
self._kill()
def timeoutReached(self):
return self._timeoutReached
def _kill(self):
"""
This method may be called multiple times as we might get unlucky
and be in the middle of creating a new process in _executeShCmd()
which won't yet be in ``self._procs``. By locking here and in
addProcess() we should be able to kill processes launched after
the initial call to _kill()
"""
with self._lock:
for p in self._procs:
lit.util.killProcessAndChildren(p.pid)
# Empty the list and note that we've done a pass over the list
self._procs = [] # Python2 doesn't have list.clear()
self._doneKillPass = True
class ShellCommandResult(object):
"""Captures the result of an individual command."""
def __init__(self, command, stdout, stderr, exitCode, timeoutReached,
outputFiles = []):
self.command = command
self.stdout = stdout
self.stderr = stderr
self.exitCode = exitCode
self.timeoutReached = timeoutReached
self.outputFiles = list(outputFiles)
def executeShCmd(cmd, shenv, results, timeout=0):
"""
Wrapper around _executeShCmd that handles
timeout
"""
# Use the helper even when no timeout is required to make
# other code simpler (i.e. avoid a bunch of ``!= None`` checks)
timeoutHelper = TimeoutHelper(timeout)
if timeout > 0:
timeoutHelper.startTimer()
finalExitCode = _executeShCmd(cmd, shenv, results, timeoutHelper)
timeoutHelper.cancel()
timeoutInfo = None
if timeoutHelper.timeoutReached():
timeoutInfo = 'Reached timeout of {} seconds'.format(timeout)
return (finalExitCode, timeoutInfo)
def expand_glob(arg, cwd):
if isinstance(arg, GlobItem):
return arg.resolve(cwd)
return [arg]
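# Expand glob patterns in every argument except the command name itself
# (args[0]), which is left untouched.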
def expand_glob_expressions(args, cwd):
result = [args[0]]
for arg in args[1:]:
result.extend(expand_glob(arg, cwd))
return result
def quote_windows_command(seq):
"""
Reimplement Python's private subprocess.list2cmdline for MSys compatibility
Based on CPython implementation here:
https://hg.python.org/cpython/file/849826a900d2/Lib/subprocess.py#l422
Some core util distributions (MSys) don't tokenize command line arguments
the same way that MSVC CRT does. Lit rolls its own quoting logic similar to
the stock CPython logic to paper over these quoting and tokenization rule
differences.
We use the same algorithm from MSDN as CPython
(http://msdn.microsoft.com/en-us/library/17w5ykft.aspx), but we treat more
characters as needing quoting, such as double quotes themselves.
"""
result = []
needquote = False
for arg in seq:
bs_buf = []
# Add a space to separate this argument from the others
if result:
result.append(' ')
# This logic differs from upstream list2cmdline.
needquote = (" " in arg) or ("\t" in arg) or ("\"" in arg) or not arg
if needquote:
result.append('"')
for c in arg:
if c == '\\':
# Don't know if we need to double yet.
bs_buf.append(c)
elif c == '"':
# Double backslashes.
result.append('\\' * len(bs_buf)*2)
bs_buf = []
result.append('\\"')
else:
# Normal char
if bs_buf:
result.extend(bs_buf)
bs_buf = []
result.append(c)
# Add remaining backslashes, if any.
if bs_buf:
result.extend(bs_buf)
if needquote:
result.extend(bs_buf)
result.append('"')
return ''.join(result)
# cmd is an 'export' or 'env' command; copy its KEY=VALUE arguments into
# the environment.
def updateEnv(env, cmd):
arg_idx = 1
for arg_idx, arg in enumerate(cmd.args[1:]):
# Partition the string into KEY=VALUE.
key, eq, val = arg.partition('=')
# Stop if there was no equals.
if eq == '':
break
env.env[key] = val
cmd.args = cmd.args[arg_idx+1:]
def executeBuiltinEcho(cmd, shenv):
"""Interpret a redirected echo command"""
opened_files = []
stdin, stdout, stderr = processRedirects(cmd, subprocess.PIPE, shenv,
opened_files)
if stdin != subprocess.PIPE or stderr != subprocess.PIPE:
raise InternalShellError(
cmd, "stdin and stderr redirects not supported for echo")
# Some tests have un-redirected echo commands to help debug test failures.
# Buffer our output and return it to the caller.
is_redirected = True
if stdout == subprocess.PIPE:
is_redirected = False
stdout = StringIO()
elif kIsWindows:
# Reopen stdout in binary mode to avoid CRLF translation. The versions
# of echo we are replacing on Windows all emit plain LF, and the LLVM
# tests now depend on this.
stdout = open(stdout.name, stdout.mode + 'b')
opened_files.append((None, None, stdout, None))
# Implement echo flags. We only support -e and -n, and not yet in
# combination. We have to ignore unknown flags, because `echo "-D FOO"`
# prints the dash.
args = cmd.args[1:]
interpret_escapes = False
write_newline = True
while len(args) >= 1 and args[0] in ('-e', '-n'):
flag = args[0]
args = args[1:]
if flag == '-e':
interpret_escapes = True
elif flag == '-n':
write_newline = False
def maybeUnescape(arg):
if not interpret_escapes:
return arg
# Python string escapes and "echo" escapes are obviously different, but
# this should be enough for the LLVM test suite.
return arg.decode('string_escape')
if args:
for arg in args[:-1]:
stdout.write(maybeUnescape(arg))
stdout.write(' ')
stdout.write(maybeUnescape(args[-1]))
if write_newline:
stdout.write('\n')
for (name, mode, f, path) in opened_files:
f.close()
if not is_redirected:
return stdout.getvalue()
return ""
def processRedirects(cmd, stdin_source, cmd_shenv, opened_files):
"""Return the standard fds for cmd after applying redirects
Returns the three standard file descriptors for the new child process. Each
fd may be an open, writable file object or a sentinel value from the
subprocess module.
"""
# Apply the redirections, we use (N,) as a sentinel to indicate stdin,
# stdout, stderr for N equal to 0, 1, or 2 respectively. Redirects to or
# from a file are represented with a list [file, mode, file-object]
# where file-object is initially None.
redirects = [(0,), (1,), (2,)]
for (op, filename) in cmd.redirects:
if op == ('>',2):
redirects[2] = [filename, 'w', None]
elif op == ('>>',2):
redirects[2] = [filename, 'a', None]
elif op == ('>&',2) and filename in '012':
redirects[2] = redirects[int(filename)]
elif op == ('>&',) or op == ('&>',):
redirects[1] = redirects[2] = [filename, 'w', None]
elif op == ('>',):
redirects[1] = [filename, 'w', None]
elif op == ('>>',):
redirects[1] = [filename, 'a', None]
elif op == ('<',):
redirects[0] = [filename, 'r', None]
else:
- raise InternalShellError(cmd, "Unsupported redirect: %r" % (r,))
+ raise InternalShellError(cmd, "Unsupported redirect: %r" % ((op, filename),))
# Open file descriptors in a second pass.
std_fds = [None, None, None]
for (index, r) in enumerate(redirects):
# Handle the sentinel values for defaults up front.
if isinstance(r, tuple):
if r == (0,):
fd = stdin_source
elif r == (1,):
if index == 0:
raise InternalShellError(cmd, "Unsupported redirect for stdin")
elif index == 1:
fd = subprocess.PIPE
else:
fd = subprocess.STDOUT
elif r == (2,):
if index != 2:
raise InternalShellError(cmd, "Unsupported redirect on stdout")
fd = subprocess.PIPE
else:
raise InternalShellError(cmd, "Bad redirect")
std_fds[index] = fd
continue
(filename, mode, fd) = r
# Check if we already have an open fd. This can happen if stdout and
# stderr go to the same place.
if fd is not None:
std_fds[index] = fd
continue
redir_filename = None
name = expand_glob(filename, cmd_shenv.cwd)
if len(name) != 1:
raise InternalShellError(cmd, "Unsupported: glob in "
"redirect expanded to multiple files")
name = name[0]
if kAvoidDevNull and name == '/dev/null':
fd = tempfile.TemporaryFile(mode=mode)
elif kIsWindows and name == '/dev/tty':
# Simulate /dev/tty on Windows.
# "CON" is a special filename for the console.
fd = open("CON", mode)
else:
# Make sure relative paths are relative to the cwd.
redir_filename = os.path.join(cmd_shenv.cwd, name)
fd = open(redir_filename, mode)
# Workaround a Win32 and/or subprocess bug when appending.
#
# FIXME: Actually, this is probably an instance of PR6753.
if mode == 'a':
fd.seek(0, 2)
# Mutate the underlying redirect list so that we can redirect stdout
# and stderr to the same place without opening the file twice.
r[2] = fd
opened_files.append((filename, mode, fd) + (redir_filename,))
std_fds[index] = fd
return std_fds
def _executeShCmd(cmd, shenv, results, timeoutHelper):
if timeoutHelper.timeoutReached():
# Prevent further recursion if the timeout has been hit
# as we should try to avoid launching more processes.
return None
if isinstance(cmd, ShUtil.Seq):
if cmd.op == ';':
res = _executeShCmd(cmd.lhs, shenv, results, timeoutHelper)
return _executeShCmd(cmd.rhs, shenv, results, timeoutHelper)
if cmd.op == '&':
raise InternalShellError(cmd,"unsupported shell operator: '&'")
if cmd.op == '||':
res = _executeShCmd(cmd.lhs, shenv, results, timeoutHelper)
if res != 0:
res = _executeShCmd(cmd.rhs, shenv, results, timeoutHelper)
return res
if cmd.op == '&&':
res = _executeShCmd(cmd.lhs, shenv, results, timeoutHelper)
if res is None:
return res
if res == 0:
res = _executeShCmd(cmd.rhs, shenv, results, timeoutHelper)
return res
raise ValueError('Unknown shell command: %r' % cmd.op)
assert isinstance(cmd, ShUtil.Pipeline)
# Handle shell builtins first.
if cmd.commands[0].args[0] == 'cd':
if len(cmd.commands) != 1:
raise ValueError("'cd' cannot be part of a pipeline")
if len(cmd.commands[0].args) != 2:
raise ValueError("'cd' supports only one argument")
newdir = cmd.commands[0].args[1]
# Update the cwd in the parent environment.
if os.path.isabs(newdir):
shenv.cwd = newdir
else:
shenv.cwd = os.path.realpath(os.path.join(shenv.cwd, newdir))
# The cd builtin always succeeds. If the directory does not exist, the
# following Popen calls will fail instead.
return 0
# Handle "echo" as a builtin if it is not part of a pipeline. This greatly
# speeds up tests that construct input files by repeatedly echo-appending to
# a file.
# FIXME: Standardize on the builtin echo implementation. We can use a
# temporary file to sidestep blocking pipe write issues.
if cmd.commands[0].args[0] == 'echo' and len(cmd.commands) == 1:
output = executeBuiltinEcho(cmd.commands[0], shenv)
results.append(ShellCommandResult(cmd.commands[0], output, "", 0,
False))
return 0
if cmd.commands[0].args[0] == 'export':
if len(cmd.commands) != 1:
raise ValueError("'export' cannot be part of a pipeline")
if len(cmd.commands[0].args) != 2:
raise ValueError("'export' supports only one argument")
updateEnv(shenv, cmd.commands[0])
return 0
procs = []
default_stdin = subprocess.PIPE
stderrTempFiles = []
opened_files = []
named_temp_files = []
# To avoid deadlock, we use a single stderr stream for piped
# output. This is null until we have seen some output using
# stderr.
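# Launch each command in the pipeline, wiring the previous command's
# stdout (or stderr, when 2>&1 is used with an unpiped stdout) into the
# next command's stdin.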
for i,j in enumerate(cmd.commands):
# Reference the global environment by default.
cmd_shenv = shenv
if j.args[0] == 'env':
# Create a copy of the global environment and modify it for this one
# command. There might be multiple envs in a pipeline:
# env FOO=1 llc < %s | env BAR=2 llvm-mc | FileCheck %s
cmd_shenv = ShellEnvironment(shenv.cwd, shenv.env)
updateEnv(cmd_shenv, j)
stdin, stdout, stderr = processRedirects(j, default_stdin, cmd_shenv,
opened_files)
# If stderr wants to come from stdout, but stdout isn't a pipe, then put
# stderr on a pipe and treat it as stdout.
if (stderr == subprocess.STDOUT and stdout != subprocess.PIPE):
stderr = subprocess.PIPE
stderrIsStdout = True
else:
stderrIsStdout = False
# Don't allow stderr on a PIPE except for the last
# process; this could deadlock.
#
# FIXME: This is slow, but so is deadlock.
if stderr == subprocess.PIPE and j != cmd.commands[-1]:
stderr = tempfile.TemporaryFile(mode='w+b')
stderrTempFiles.append((i, stderr))
# Resolve the executable path ourselves.
args = list(j.args)
executable = None
# For paths relative to cwd, use the cwd of the shell environment.
if args[0].startswith('.'):
exe_in_cwd = os.path.join(cmd_shenv.cwd, args[0])
if os.path.isfile(exe_in_cwd):
executable = exe_in_cwd
if not executable:
executable = lit.util.which(args[0], cmd_shenv.env['PATH'])
if not executable:
raise InternalShellError(j, '%r: command not found' % j.args[0])
# Replace uses of /dev/null with temporary files.
if kAvoidDevNull:
for i,arg in enumerate(args):
if arg == "/dev/null":
f = tempfile.NamedTemporaryFile(delete=False)
f.close()
named_temp_files.append(f.name)
args[i] = f.name
# Expand all glob expressions
args = expand_glob_expressions(args, cmd_shenv.cwd)
# On Windows, do our own command line quoting for better compatibility
# with some core utility distributions.
if kIsWindows:
args = quote_windows_command(args)
try:
procs.append(subprocess.Popen(args, cwd=cmd_shenv.cwd,
executable = executable,
stdin = stdin,
stdout = stdout,
stderr = stderr,
env = cmd_shenv.env,
close_fds = kUseCloseFDs))
# Let the helper know about this process
timeoutHelper.addProcess(procs[-1])
except OSError as e:
raise InternalShellError(j, 'Could not create process ({}) due to {}'.format(executable, e))
# Immediately close stdin for any process taking stdin from us.
if stdin == subprocess.PIPE:
procs[-1].stdin.close()
procs[-1].stdin = None
# Update the current stdin source.
if stdout == subprocess.PIPE:
default_stdin = procs[-1].stdout
elif stderrIsStdout:
default_stdin = procs[-1].stderr
else:
default_stdin = subprocess.PIPE
# Explicitly close any redirected files. We need to do this now because we
# need to release any handles we may have on the temporary files (important
# on Win32, for example). Since we have already spawned the subprocess, our
# handles have already been transferred so we do not need them anymore.
for (name, mode, f, path) in opened_files:
f.close()
# FIXME: There is probably still deadlock potential here. Yawn.
procData = [None] * len(procs)
procData[-1] = procs[-1].communicate()
for i in range(len(procs) - 1):
if procs[i].stdout is not None:
out = procs[i].stdout.read()
else:
out = ''
if procs[i].stderr is not None:
err = procs[i].stderr.read()
else:
err = ''
procData[i] = (out,err)
# Read stderr out of the temp files.
for i,f in stderrTempFiles:
f.seek(0, 0)
procData[i] = (procData[i][0], f.read())
def to_string(bytes):
if isinstance(bytes, str):
return bytes
return bytes.encode('utf-8')
exitCode = None
for i,(out,err) in enumerate(procData):
res = procs[i].wait()
# Detect Ctrl-C in subprocess.
if res == -signal.SIGINT:
raise KeyboardInterrupt
# Ensure the resulting output is always of string type.
try:
if out is None:
out = ''
else:
out = to_string(out.decode('utf-8', errors='replace'))
except:
out = str(out)
try:
if err is None:
err = ''
else:
err = to_string(err.decode('utf-8', errors='replace'))
except:
err = str(err)
# Gather the redirected output files for failed commands.
output_files = []
if res != 0:
for (name, mode, f, path) in sorted(opened_files):
if path is not None and mode in ('w', 'a'):
try:
with open(path, 'rb') as f:
data = f.read()
except:
data = None
if data is not None:
output_files.append((name, path, data))
results.append(ShellCommandResult(
cmd.commands[i], out, err, res, timeoutHelper.timeoutReached(),
output_files))
if cmd.pipe_err:
# Take the last failing exit code from the pipeline.
if not exitCode or res != 0:
exitCode = res
else:
exitCode = res
# Remove any named temporary files we created.
for f in named_temp_files:
try:
os.remove(f)
except OSError:
pass
if cmd.negate:
exitCode = not exitCode
return exitCode
def executeScriptInternal(test, litConfig, tmpBase, commands, cwd):
cmds = []
for ln in commands:
try:
cmds.append(ShUtil.ShParser(ln, litConfig.isWindows,
test.config.pipefail).parse())
except:
return lit.Test.Result(Test.FAIL, "shell parser error on: %r" % ln)
cmd = cmds[0]
for c in cmds[1:]:
cmd = ShUtil.Seq(cmd, '&&', c)
results = []
timeoutInfo = None
try:
shenv = ShellEnvironment(cwd, test.config.environment)
exitCode, timeoutInfo = executeShCmd(cmd, shenv, results, timeout=litConfig.maxIndividualTestTime)
except InternalShellError:
e = sys.exc_info()[1]
exitCode = 127
results.append(
ShellCommandResult(e.command, '', e.message, exitCode, False))
out = err = ''
for i,result in enumerate(results):
# Write the command line run.
out += '$ %s\n' % (' '.join('"%s"' % s
for s in result.command.args),)
# If nothing interesting happened, move on.
if litConfig.maxIndividualTestTime == 0 and \
result.exitCode == 0 and \
not result.stdout.strip() and not result.stderr.strip():
continue
# Otherwise, something failed or was printed, show it.
# Add the command output, if redirected.
for (name, path, data) in result.outputFiles:
if data.strip():
out += "# redirected output from %r:\n" % (name,)
data = to_string(data.decode('utf-8', errors='replace'))
if len(data) > 1024:
out += data[:1024] + "\n...\n"
out += "note: data was truncated\n"
else:
out += data
out += "\n"
if result.stdout.strip():
out += '# command output:\n%s\n' % (result.stdout,)
if result.stderr.strip():
out += '# command stderr:\n%s\n' % (result.stderr,)
if not result.stdout.strip() and not result.stderr.strip():
out += "note: command had no output on stdout or stderr\n"
# Show the error conditions:
if result.exitCode != 0:
# On Windows, a negative exit code indicates a signal, and those are
# easier to recognize or look up if we print them in hex.
if litConfig.isWindows and result.exitCode < 0:
codeStr = hex(int(result.exitCode & 0xFFFFFFFF)).rstrip("L")
else:
codeStr = str(result.exitCode)
out += "error: command failed with exit status: %s\n" % (
codeStr,)
if litConfig.maxIndividualTestTime > 0:
out += 'error: command reached timeout: %s\n' % (
str(result.timeoutReached),)
return out, err, exitCode, timeoutInfo
def executeScript(test, litConfig, tmpBase, commands, cwd):
bashPath = litConfig.getBashPath()
isWin32CMDEXE = (litConfig.isWindows and not bashPath)
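# With no bash available on Windows, fall back to running the script as
# a cmd.exe batch file.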
script = tmpBase + '.script'
if isWin32CMDEXE:
script += '.bat'
# Write script file
mode = 'w'
if litConfig.isWindows and not isWin32CMDEXE:
mode += 'b' # Avoid CRLFs when writing bash scripts.
f = open(script, mode)
if isWin32CMDEXE:
f.write('\nif %ERRORLEVEL% NEQ 0 EXIT\n'.join(commands))
else:
if test.config.pipefail:
f.write('set -o pipefail;')
if litConfig.echo_all_commands:
f.write('set -x;')
f.write('{ ' + '; } &&\n{ '.join(commands) + '; }')
f.write('\n')
f.close()
if isWin32CMDEXE:
command = ['cmd','/c', script]
else:
if bashPath:
command = [bashPath, script]
else:
command = ['/bin/sh', script]
if litConfig.useValgrind:
# FIXME: Running valgrind on sh is overkill. We probably could just
# run on clang with no real loss.
command = litConfig.valgrindArgs + command
try:
out, err, exitCode = lit.util.executeCommand(command, cwd=cwd,
env=test.config.environment,
timeout=litConfig.maxIndividualTestTime)
return (out, err, exitCode, None)
except lit.util.ExecuteCommandTimeoutException as e:
return (e.out, e.err, e.exitCode, e.msg)
def parseIntegratedTestScriptCommands(source_path, keywords):
"""
parseIntegratedTestScriptCommands(source_path) -> commands
Parse the commands in an integrated test script file into a list of
(line_number, command_type, line).
"""
# This code is carefully written to be dual compatible with Python 2.5+ and
# Python 3 without requiring input files to always have valid codings. The
# trick we use is to open the file in binary mode and use the regular
# expression library to find the commands, with it scanning strings in
# Python2 and bytes in Python3.
#
# Once we find a match, we do require each script line to be decodable to
# UTF-8, so we convert the outputs to UTF-8 before returning. This way the
# remaining code can work with "strings" agnostic of the executing Python
# version.
keywords_re = re.compile(
to_bytes("(%s)(.*)\n" % ("|".join(re.escape(k) for k in keywords),)))
f = open(source_path, 'rb')
try:
# Read the entire file contents.
data = f.read()
# Ensure the data ends with a newline.
if not data.endswith(to_bytes('\n')):
data = data + to_bytes('\n')
# Iterate over the matches.
line_number = 1
last_match_position = 0
for match in keywords_re.finditer(data):
# Compute the updated line number by counting the intervening
# newlines.
match_position = match.start()
line_number += data.count(to_bytes('\n'), last_match_position,
match_position)
last_match_position = match_position
# Convert the keyword and line to UTF-8 strings and yield the
# command. Note that we take care to return regular strings in
# Python 2, to avoid other code having to differentiate between the
# str and unicode types.
keyword,ln = match.groups()
yield (line_number, to_string(keyword.decode('utf-8')),
to_string(ln.decode('utf-8')))
finally:
f.close()
def getTempPaths(test):
"""Get the temporary location, this is always relative to the test suite
root, not test source root."""
execpath = test.getExecPath()
execdir,execbase = os.path.split(execpath)
tmpDir = os.path.join(execdir, 'Output')
tmpBase = os.path.join(tmpDir, execbase)
return tmpDir, tmpBase
def getDefaultSubstitutions(test, tmpDir, tmpBase, normalize_slashes=False):
sourcepath = test.getSourcePath()
sourcedir = os.path.dirname(sourcepath)
# Normalize slashes, if requested.
if normalize_slashes:
sourcepath = sourcepath.replace('\\', '/')
sourcedir = sourcedir.replace('\\', '/')
tmpDir = tmpDir.replace('\\', '/')
tmpBase = tmpBase.replace('\\', '/')
# We use #_MARKER_# to hide %% while we do the other substitutions.
substitutions = []
substitutions.extend([('%%', '#_MARKER_#')])
substitutions.extend(test.config.substitutions)
tmpName = tmpBase + '.tmp'
baseName = os.path.basename(tmpBase)
substitutions.extend([('%s', sourcepath),
('%S', sourcedir),
('%p', sourcedir),
('%{pathsep}', os.pathsep),
('%t', tmpName),
('%basename_t', baseName),
('%T', tmpDir),
('#_MARKER_#', '%')])
# "%/[STpst]" should be normalized.
substitutions.extend([
('%/s', sourcepath.replace('\\', '/')),
('%/S', sourcedir.replace('\\', '/')),
('%/p', sourcedir.replace('\\', '/')),
('%/t', tmpBase.replace('\\', '/') + '.tmp'),
('%/T', tmpDir.replace('\\', '/')),
])
# "%:[STpst]" are paths without colons.
if kIsWindows:
substitutions.extend([
('%:s', re.sub(r'^(.):', r'\1', sourcepath)),
('%:S', re.sub(r'^(.):', r'\1', sourcedir)),
('%:p', re.sub(r'^(.):', r'\1', sourcedir)),
('%:t', re.sub(r'^(.):', r'\1', tmpBase) + '.tmp'),
('%:T', re.sub(r'^(.):', r'\1', tmpDir)),
])
else:
substitutions.extend([
('%:s', sourcepath),
('%:S', sourcedir),
('%:p', sourcedir),
('%:t', tmpBase + '.tmp'),
('%:T', tmpDir),
])
return substitutions
def applySubstitutions(script, substitutions):
"""Apply substitutions to the script. Allow full regular expression syntax.
Replace each matching occurrence of regular expression pattern a with
substitution b in line ln."""
def processLine(ln):
# Apply substitutions
for a,b in substitutions:
if kIsWindows:
b = b.replace("\\","\\\\")
ln = re.sub(a, b, ln)
# Strip the trailing newline and any extra whitespace.
return ln.strip()
# Note Python 3 map() gives an iterator rather than a list so explicitly
# convert to list before returning.
return list(map(processLine, script))
class ParserKind(object):
"""
An enumeration representing the style of an integrated test keyword or
command.
TAG: A keyword taking no value. Ex 'END.'
COMMAND: A keyword taking a list of shell commands. Ex 'RUN:'
LIST: A keyword taking a comma-separated list of values.
BOOLEAN_EXPR: A keyword taking a comma-separated list of
boolean expressions. Ex 'XFAIL:'
CUSTOM: A keyword with custom parsing semantics.
"""
TAG = 0
COMMAND = 1
LIST = 2
BOOLEAN_EXPR = 3
CUSTOM = 4
@staticmethod
def allowedKeywordSuffixes(value):
return { ParserKind.TAG: ['.'],
ParserKind.COMMAND: [':'],
ParserKind.LIST: [':'],
ParserKind.BOOLEAN_EXPR: [':'],
ParserKind.CUSTOM: [':', '.']
} [value]
@staticmethod
def str(value):
return { ParserKind.TAG: 'TAG',
ParserKind.COMMAND: 'COMMAND',
ParserKind.LIST: 'LIST',
ParserKind.BOOLEAN_EXPR: 'BOOLEAN_EXPR',
ParserKind.CUSTOM: 'CUSTOM'
} [value]
class IntegratedTestKeywordParser(object):
"""A parser for LLVM/Clang style integrated test scripts.
keyword: The keyword to parse for. It must end in either '.' or ':'.
kind: A value of ParserKind.
parser: A custom parser. This value may only be specified with
ParserKind.CUSTOM.
"""
def __init__(self, keyword, kind, parser=None, initial_value=None):
allowedSuffixes = ParserKind.allowedKeywordSuffixes(kind)
if len(keyword) == 0 or keyword[-1] not in allowedSuffixes:
if len(allowedSuffixes) == 1:
raise ValueError("Keyword '%s' of kind '%s' must end in '%s'"
% (keyword, ParserKind.str(kind),
allowedSuffixes[0]))
else:
raise ValueError("Keyword '%s' of kind '%s' must end in "
" one of '%s'"
% (keyword, ParserKind.str(kind),
' '.join(allowedSuffixes)))
if parser is not None and kind != ParserKind.CUSTOM:
raise ValueError("custom parsers can only be specified with "
"ParserKind.CUSTOM")
self.keyword = keyword
self.kind = kind
self.parsed_lines = []
self.value = initial_value
self.parser = parser
if kind == ParserKind.COMMAND:
self.parser = self._handleCommand
elif kind == ParserKind.LIST:
self.parser = self._handleList
elif kind == ParserKind.BOOLEAN_EXPR:
self.parser = self._handleBooleanExpr
elif kind == ParserKind.TAG:
self.parser = self._handleTag
elif kind == ParserKind.CUSTOM:
if parser is None:
raise ValueError("ParserKind.CUSTOM requires a custom parser")
self.parser = parser
else:
raise ValueError("Unknown kind '%s'" % kind)
def parseLine(self, line_number, line):
try:
self.parsed_lines += [(line_number, line)]
self.value = self.parser(line_number, line, self.value)
except ValueError as e:
raise ValueError(str(e) + ("\nin %s directive on test line %d" %
(self.keyword, line_number)))
def getValue(self):
return self.value
@staticmethod
def _handleTag(line_number, line, output):
"""A helper for parsing TAG type keywords"""
return (not line.strip() or output)
@staticmethod
def _handleCommand(line_number, line, output):
"""A helper for parsing COMMAND type keywords"""
# Trim trailing whitespace.
line = line.rstrip()
# Substitute line number expressions
line = re.sub('%\(line\)', str(line_number), line)
def replace_line_number(match):
if match.group(1) == '+':
return str(line_number + int(match.group(2)))
if match.group(1) == '-':
return str(line_number - int(match.group(2)))
line = re.sub('%\(line *([\+-]) *(\d+)\)', replace_line_number, line)
# Collapse lines with trailing '\\'.
if output and output[-1][-1] == '\\':
output[-1] = output[-1][:-1] + line
else:
if output is None:
output = []
output.append(line)
return output
@staticmethod
def _handleList(line_number, line, output):
"""A parser for LIST type keywords"""
if output is None:
output = []
output.extend([s.strip() for s in line.split(',')])
return output
@staticmethod
def _handleBooleanExpr(line_number, line, output):
"""A parser for BOOLEAN_EXPR type keywords"""
if output is None:
output = []
output.extend([s.strip() for s in line.split(',')])
# Evaluate each expression to verify syntax.
# We don't want any results, just the raised ValueError.
for s in output:
if s != '*':
BooleanExpression.evaluate(s, [])
return output
@staticmethod
def _handleRequiresAny(line_number, line, output):
"""A custom parser to transform REQUIRES-ANY: into REQUIRES:"""
# Extract the conditions specified in REQUIRES-ANY: as written.
conditions = []
IntegratedTestKeywordParser._handleList(line_number, line, conditions)
# Output a `REQUIRES: a || b || c` expression in its place.
expression = ' || '.join(conditions)
IntegratedTestKeywordParser._handleBooleanExpr(line_number,
expression, output)
return output
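# Editor's sketch (hypothetical keyword, not part of this patch): a test format
# can define its own directive by instantiating the class above, e.g.
#     my_parser = IntegratedTestKeywordParser('MY_LIST:', ParserKind.LIST)
#     my_parser.parseLine(4, 'a, b, c')
#     my_parser.getValue()          # -> ['a', 'b', 'c']
# A ParserKind.CUSTOM keyword must also supply the 'parser' callable, as the
# built-in REQUIRES-ANY: keyword does in parseIntegratedTestScript() below.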
def parseIntegratedTestScript(test, additional_parsers=[],
require_script=True):
"""parseIntegratedTestScript - Scan an LLVM/Clang style integrated test
script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES'
and 'UNSUPPORTED' information.
If additional parsers are specified then the test is also scanned for the
keywords they specify and all matches are passed to the custom parser.
If 'require_script' is False an empty script
may be returned. This can be used for test formats where the actual script
is optional or ignored.
"""
# Install the built-in keyword parsers.
script = []
builtin_parsers = [
IntegratedTestKeywordParser('RUN:', ParserKind.COMMAND,
initial_value=script),
IntegratedTestKeywordParser('XFAIL:', ParserKind.BOOLEAN_EXPR,
initial_value=test.xfails),
IntegratedTestKeywordParser('REQUIRES:', ParserKind.BOOLEAN_EXPR,
initial_value=test.requires),
IntegratedTestKeywordParser('REQUIRES-ANY:', ParserKind.CUSTOM,
IntegratedTestKeywordParser._handleRequiresAny,
initial_value=test.requires),
IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.BOOLEAN_EXPR,
initial_value=test.unsupported),
IntegratedTestKeywordParser('END.', ParserKind.TAG)
]
keyword_parsers = {p.keyword: p for p in builtin_parsers}
# Install user-defined additional parsers.
for parser in additional_parsers:
if not isinstance(parser, IntegratedTestKeywordParser):
raise ValueError('additional parser must be an instance of '
'IntegratedTestKeywordParser')
if parser.keyword in keyword_parsers:
raise ValueError("Parser for keyword '%s' already exists"
% parser.keyword)
keyword_parsers[parser.keyword] = parser
# Collect the test lines from the script.
sourcepath = test.getSourcePath()
for line_number, command_type, ln in \
parseIntegratedTestScriptCommands(sourcepath,
keyword_parsers.keys()):
parser = keyword_parsers[command_type]
parser.parseLine(line_number, ln)
if command_type == 'END.' and parser.getValue() is True:
break
# Verify the script contains a run line.
if require_script and not script:
return lit.Test.Result(Test.UNRESOLVED, "Test has no run line!")
# Check for unterminated run lines.
if script and script[-1][-1] == '\\':
return lit.Test.Result(Test.UNRESOLVED,
"Test has unterminated run lines (with '\\')")
# Enforce REQUIRES:
missing_required_features = test.getMissingRequiredFeatures()
if missing_required_features:
msg = ', '.join(missing_required_features)
return lit.Test.Result(Test.UNSUPPORTED,
"Test requires the following unavailable "
"features: %s" % msg)
# Enforce UNSUPPORTED:
unsupported_features = test.getUnsupportedFeatures()
if unsupported_features:
msg = ', '.join(unsupported_features)
return lit.Test.Result(
Test.UNSUPPORTED,
"Test does not support the following features "
"and/or targets: %s" % msg)
# Enforce limit_to_features.
if not test.isWithinFeatureLimits():
msg = ', '.join(test.config.limit_to_features)
return lit.Test.Result(Test.UNSUPPORTED,
"Test does not require any of the features "
"specified in limit_to_features: %s" % msg)
return script
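# Editor's sketch (hypothetical directive name): custom keywords are collected
# alongside RUN/XFAIL/REQUIRES by passing them as additional parsers:
#     flags_parser = IntegratedTestKeywordParser('MY_FLAGS:', ParserKind.LIST)
#     parsed = parseIntegratedTestScript(test,
#                                        additional_parsers=[flags_parser],
#                                        require_script=False)
#     if isinstance(parsed, lit.Test.Result):
#         return parsed             # early UNSUPPORTED/UNRESOLVED result
#     my_flags = flags_parser.getValue() or []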
def _runShTest(test, litConfig, useExternalSh, script, tmpBase):
# Create the output directory if it does not already exist.
lit.util.mkdir_p(os.path.dirname(tmpBase))
execdir = os.path.dirname(test.getExecPath())
if useExternalSh:
res = executeScript(test, litConfig, tmpBase, script, execdir)
else:
res = executeScriptInternal(test, litConfig, tmpBase, script, execdir)
if isinstance(res, lit.Test.Result):
return res
out,err,exitCode,timeoutInfo = res
if exitCode == 0:
status = Test.PASS
else:
if timeoutInfo is None:
status = Test.FAIL
else:
status = Test.TIMEOUT
# Form the output log.
output = """Script:\n--\n%s\n--\nExit Code: %d\n""" % (
'\n'.join(script), exitCode)
if timeoutInfo is not None:
output += """Timeout: %s\n""" % (timeoutInfo,)
output += "\n"
# Append the outputs, if present.
if out:
output += """Command Output (stdout):\n--\n%s\n--\n""" % (out,)
if err:
output += """Command Output (stderr):\n--\n%s\n--\n""" % (err,)
return lit.Test.Result(status, output)
def executeShTest(test, litConfig, useExternalSh,
extra_substitutions=[]):
if test.config.unsupported:
return lit.Test.Result(Test.UNSUPPORTED, 'Test is unsupported')
script = parseIntegratedTestScript(test)
if isinstance(script, lit.Test.Result):
return script
if litConfig.noExecute:
return lit.Test.Result(Test.PASS)
tmpDir, tmpBase = getTempPaths(test)
substitutions = list(extra_substitutions)
substitutions += getDefaultSubstitutions(test, tmpDir, tmpBase,
normalize_slashes=useExternalSh)
script = applySubstitutions(script, substitutions)
# Re-run failed tests up to test_retry_attempts times.
attempts = 1
if hasattr(test.config, 'test_retry_attempts'):
attempts += test.config.test_retry_attempts
for i in range(attempts):
res = _runShTest(test, litConfig, useExternalSh, script, tmpBase)
if res.code != Test.FAIL:
break
# If we had to run the test more than once, count it as a flaky pass. These
# will be printed separately in the test summary.
if i > 0 and res.code == Test.PASS:
res.code = Test.FLAKYPASS
return res
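The retry loop above is driven by an optional per-suite knob. A minimal sketch of enabling it from a lit.local.cfg, assuming the suite otherwise uses the stock ShTest flow:
# lit.local.cfg -- editor's sketch
# Re-run a failing test up to two extra times; a late success is reported
# as a flaky pass in the summary rather than as a failure.
config.test_retry_attempts = 2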
diff --git a/utils/lit/lit/formats/__init__.py b/utils/lit/lit/formats/__init__.py
index 7d14ca4b535a..3ff46e93ead2 100644
--- a/utils/lit/lit/formats/__init__.py
+++ b/utils/lit/lit/formats/__init__.py
@@ -1,3 +1,8 @@
-from lit.formats.base import TestFormat # noqa: F401
+from lit.formats.base import ( # noqa: F401
+ TestFormat,
+ FileBasedTest,
+ OneCommandPerFileTest
+)
+
from lit.formats.googletest import GoogleTest # noqa: F401
from lit.formats.shtest import ShTest # noqa: F401
diff --git a/utils/lit/lit/formats/base.py b/utils/lit/lit/formats/base.py
index baa9ff1d3b7d..6721d17e334e 100644
--- a/utils/lit/lit/formats/base.py
+++ b/utils/lit/lit/formats/base.py
@@ -1,50 +1,117 @@
-import abc
+from __future__ import absolute_import
+import os
+
+import lit.Test
+import lit.util
class TestFormat(object):
- """Base class for test formats.
-
- A TestFormat encapsulates logic for finding and executing a certain type of
- test. For example, a subclass FooTestFormat would contain the logic for
- finding tests written in the 'Foo' format, and the logic for running a
- single one.
-
- TestFormat is an Abstract Base Class (ABC). It uses the Python abc.ABCMeta
- type and associated @abc.abstractmethod decorator. Together, these provide
- subclass behaviour which is notionally similar to C++ pure virtual classes:
- only subclasses which implement all abstract methods can be instantiated
- (the implementation may come from an intermediate base).
-
- For details on ABCs, see: https://docs.python.org/2/library/abc.html. Note
- that Python ABCs have extensive abilities beyond what is used here. For
- TestFormat, we only care about enforcing that abstract methods are
- implemented.
- """
-
- __metaclass__ = abc.ABCMeta
-
- @abc.abstractmethod
- def getTestsInDirectory(self, testSuite, path_in_suite, litConfig,
- localConfig):
- """Finds tests of this format in the given directory.
-
- Args:
- testSuite: a Test.TestSuite object.
- path_in_suite: the subpath under testSuite to look for tests.
- litConfig: the LitConfig for the test suite.
- localConfig: a LitConfig with local specializations.
-
- Returns:
- An iterable of Test.Test objects.
- """
-
- @abc.abstractmethod
+ pass
+
+###
+
+class FileBasedTest(TestFormat):
+ def getTestsInDirectory(self, testSuite, path_in_suite,
+ litConfig, localConfig):
+ source_path = testSuite.getSourcePath(path_in_suite)
+ for filename in os.listdir(source_path):
+ # Ignore dot files and excluded tests.
+ if (filename.startswith('.') or
+ filename in localConfig.excludes):
+ continue
+
+ filepath = os.path.join(source_path, filename)
+ if not os.path.isdir(filepath):
+ base,ext = os.path.splitext(filename)
+ if ext in localConfig.suffixes:
+ yield lit.Test.Test(testSuite, path_in_suite + (filename,),
+ localConfig)
+
+###
+
+import re
+import tempfile
+
+class OneCommandPerFileTest(TestFormat):
+ # FIXME: Refactor into generic test for running some command on a directory
+ # of inputs.
+
+ def __init__(self, command, dir, recursive=False,
+ pattern=".*", useTempInput=False):
+ if isinstance(command, str):
+ self.command = [command]
+ else:
+ self.command = list(command)
+ if dir is not None:
+ dir = str(dir)
+ self.dir = dir
+ self.recursive = bool(recursive)
+ self.pattern = re.compile(pattern)
+ self.useTempInput = useTempInput
+
+ def getTestsInDirectory(self, testSuite, path_in_suite,
+ litConfig, localConfig):
+ dir = self.dir
+ if dir is None:
+ dir = testSuite.getSourcePath(path_in_suite)
+
+ for dirname,subdirs,filenames in os.walk(dir):
+ if not self.recursive:
+ subdirs[:] = []
+
+ subdirs[:] = [d for d in subdirs
+ if (d != '.svn' and
+ d not in localConfig.excludes)]
+
+ for filename in filenames:
+ if (filename.startswith('.') or
+ not self.pattern.match(filename) or
+ filename in localConfig.excludes):
+ continue
+
+ path = os.path.join(dirname,filename)
+ suffix = path[len(dir):]
+ if suffix.startswith(os.sep):
+ suffix = suffix[1:]
+ test = lit.Test.Test(
+ testSuite, path_in_suite + tuple(suffix.split(os.sep)),
+ localConfig)
+ # FIXME: Hack?
+ test.source_path = path
+ yield test
+
+ def createTempInput(self, tmp, test):
+ raise NotImplementedError('This is an abstract method.')
+
def execute(self, test, litConfig):
- """Runs the given 'test', which is of this format.
+ if test.config.unsupported:
+ return (lit.Test.UNSUPPORTED, 'Test is unsupported')
+
+ cmd = list(self.command)
+
+ # If using temp input, create a temporary file and hand it to the
+ # subclass.
+ if self.useTempInput:
+ tmp = tempfile.NamedTemporaryFile(suffix='.cpp')
+ self.createTempInput(tmp, test)
+ tmp.flush()
+ cmd.append(tmp.name)
+ elif hasattr(test, 'source_path'):
+ cmd.append(test.source_path)
+ else:
+ cmd.append(test.getSourcePath())
+
+ out, err, exitCode = lit.util.executeCommand(cmd)
+
+ diags = out + err
+ if not exitCode and not diags.strip():
+ return lit.Test.PASS,''
- Args:
- test: a Test.Test object describing the test to run.
- litConfig: the LitConfig for the test suite.
+ # Try to include some useful information.
+ report = """Command: %s\n""" % ' '.join(["'%s'" % a
+ for a in cmd])
+ if self.useTempInput:
+ report += """Temporary File: %s\n""" % tmp.name
+ report += "--\n%s--\n""" % open(tmp.name).read()
+ report += """Output:\n--\n%s--""" % diags
- Returns:
- A tuple of (status:Test.ResultCode, message:str)
- """
+ return lit.Test.FAIL, report
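For reference, a hedged sketch of how a suite's lit.cfg might use the re-exported OneCommandPerFileTest; the tool name, file pattern, and generated input below are illustrative and not taken from this patch:
# lit.cfg -- editor's sketch
import lit.formats

class MyToolTest(lit.formats.OneCommandPerFileTest):
    # Only consulted when useTempInput=True; 'tmp' is a NamedTemporaryFile
    # opened in binary mode, so encode the generated source before writing.
    def createTempInput(self, tmp, test):
        tmp.write(('// generated from %s\n' % test.getSourcePath()).encode())

config.test_format = MyToolTest(command=['my-tool', '-verify'], dir=None,
                                recursive=True, pattern=r'.*\.cpp$',
                                useTempInput=True)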
diff --git a/utils/lit/lit/formats/shtest.py b/utils/lit/lit/formats/shtest.py
index 01ecd192092e..fdc9bd0241f3 100644
--- a/utils/lit/lit/formats/shtest.py
+++ b/utils/lit/lit/formats/shtest.py
@@ -1,48 +1,25 @@
from __future__ import absolute_import
-import os
-
-import lit.Test
import lit.TestRunner
import lit.util
-from .base import TestFormat
-class ShTest(TestFormat):
+from .base import FileBasedTest
+
+
+class ShTest(FileBasedTest):
"""ShTest is a format with one file per test.
This is the primary format for regression tests as described in the LLVM
testing guide:
http://llvm.org/docs/TestingGuide.html
The ShTest files contain some number of shell-like command pipelines, along
with assertions about what should be in the output.
"""
-
- def __init__(self, execute_external = False):
- """Initializer.
-
- The 'execute_external' argument controls whether lit uses its internal
- logic for command pipelines, or passes the command to a shell
- subprocess.
-
- Args:
- execute_external: (optional) If true, use shell subprocesses instead
- of lit's internal pipeline logic.
- """
+ def __init__(self, execute_external=False):
self.execute_external = execute_external
- def getTestsInDirectory(self, testSuite, path_in_suite,
- litConfig, localConfig):
- """Yields test files matching 'suffixes' from the localConfig."""
- file_matches = lit.util.listdir_files(
- testSuite.getSourcePath(path_in_suite),
- localConfig.suffixes, localConfig.excludes)
- for filename in file_matches:
- yield lit.Test.Test(testSuite, path_in_suite + (filename,),
- localConfig)
-
def execute(self, test, litConfig):
- """Interprets and runs the given test file, and returns the result."""
return lit.TestRunner.executeShTest(test, litConfig,
self.execute_external)
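A typical lit.cfg keeps constructing the format the same way after this refactoring; the suite name, suffixes, and external-shell choice below are illustrative:
# lit.cfg -- editor's sketch
import lit.formats

config.name = 'MySuite'
config.suffixes = ['.ll', '.c']
# execute_external=True runs each RUN pipeline in a shell subprocess instead
# of lit's internal interpreter.
config.test_format = lit.formats.ShTest(execute_external=True)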
diff --git a/utils/lit/lit/run.py b/utils/lit/lit/run.py
index 1290c142c834..3e39bdb92203 100644
--- a/utils/lit/lit/run.py
+++ b/utils/lit/lit/run.py
@@ -1,248 +1,247 @@
import os
import sys
import threading
import time
import traceback
try:
import Queue as queue
except ImportError:
import queue
try:
import win32api
except ImportError:
win32api = None
import multiprocessing
import lit.Test
def abort_now():
"""Abort the current process without doing any exception teardown"""
sys.stdout.flush()
if win32api:
win32api.TerminateProcess(win32api.GetCurrentProcess(), 3)
else:
os.kill(0, 9)
class _Display(object):
def __init__(self, display, provider, maxFailures):
self.display = display
self.provider = provider
self.maxFailures = maxFailures or object()
self.failedCount = 0
def update(self, test):
self.display.update(test)
self.failedCount += (test.result.code == lit.Test.FAIL)
if self.failedCount == self.maxFailures:
self.provider.cancel()
class Run(object):
"""
This class represents a concrete, configured testing run.
"""
def __init__(self, lit_config, tests):
self.lit_config = lit_config
self.tests = tests
+ # Set up semaphores to limit parallelism of certain classes of tests.
+ # For example, some ASan tests require lots of virtual memory and run
+ # faster with less parallelism on OS X.
+ self.parallelism_semaphores = \
+ {k: multiprocessing.Semaphore(v) for k, v in
+ self.lit_config.parallelism_groups.items()}
def execute_test(self, test):
return _execute_test_impl(test, self.lit_config,
self.parallelism_semaphores)
def execute_tests(self, display, jobs, max_time=None):
"""
execute_tests(display, jobs, [max_time])
Execute each of the tests in the run, using up to jobs number of
parallel tasks, and inform the display of each individual result. The
provided tests should be a subset of the tests available in this run
object.
If max_time is non-None, it should be a time in seconds after which to
stop executing tests.
The display object will have its update method called with each test as
it is completed. The calls are guaranteed to be locked with respect to
one another, but are *not* guaranteed to be called on the same thread as
this method was invoked on.
Upon completion, each test in the run will have its result
computed. Tests which were not actually executed (for any reason) will
be given an UNRESOLVED result.
"""
# Don't do anything if we aren't going to run any tests.
if not self.tests or jobs == 0:
return
- # Set up semaphores to limit parallelism of certain classes of tests.
- # For example, some ASan tests require lots of virtual memory and run
- # faster with less parallelism on OS X.
- self.parallelism_semaphores = \
- {k: multiprocessing.Semaphore(v) for k, v in
- self.lit_config.parallelism_groups.items()}
-
# Install a console-control signal handler on Windows.
if win32api is not None:
def console_ctrl_handler(type):
print('\nCtrl-C detected, terminating.')
pool.terminate()
pool.join()
abort_now()
return True
win32api.SetConsoleCtrlHandler(console_ctrl_handler, True)
# Save the display object on the runner so that we can update it from
# our task completion callback.
self.display = display
# We need to issue many wait calls, so compute the final deadline and
# subtract time.time() from that as we go along.
deadline = None
if max_time:
deadline = time.time() + max_time
# Start a process pool. Copy over the data shared between all test runs.
# FIXME: Find a way to capture the worker process stderr. If the user
# interrupts the workers before we make it into our task callback, they
# will each raise a KeyboardInterrupt exception and print to stderr at
# the same time.
pool = multiprocessing.Pool(jobs, worker_initializer,
(self.lit_config,
self.parallelism_semaphores))
try:
self.failure_count = 0
self.hit_max_failures = False
async_results = [pool.apply_async(worker_run_one_test,
args=(test_index, test),
callback=self.consume_test_result)
for test_index, test in enumerate(self.tests)]
pool.close()
# Wait for all results to come in. The callback that runs in the
# parent process will update the display.
for a in async_results:
if deadline:
a.wait(deadline - time.time())
else:
# Python condition variables cannot be interrupted unless
# they have a timeout. This can make lit unresponsive to
# KeyboardInterrupt, so do a busy wait with a timeout.
while not a.ready():
a.wait(1)
if not a.successful():
a.get() # Exceptions raised here come from the worker.
if self.hit_max_failures:
break
except:
# Stop the workers and wait for any straggling results to come in
# if we exited without waiting on every async result.
pool.terminate()
raise
finally:
pool.join()
# Mark any tests that weren't run as UNRESOLVED.
for test in self.tests:
if test.result is None:
test.setResult(lit.Test.Result(lit.Test.UNRESOLVED, '', 0.0))
def consume_test_result(self, pool_result):
"""Test completion callback for worker_run_one_test
Updates the test result status in the parent process. Each task in the
pool returns the test index and the result, and we use the index to look
up the original test object. Also updates the progress bar as tasks
complete.
"""
# Don't add any more test results after we've hit the maximum failure
# count. Otherwise we're racing with the main thread, which is going
# to terminate the process pool soon.
if self.hit_max_failures:
return
(test_index, test_with_result) = pool_result
# Update the parent process copy of the test. This includes the result,
# XFAILS, REQUIRES, and UNSUPPORTED statuses.
assert self.tests[test_index].file_path == test_with_result.file_path, \
"parent and child disagree on test path"
self.tests[test_index] = test_with_result
self.display.update(test_with_result)
# If we've finished all the tests or too many tests have failed, notify
# the main thread that we've stopped testing.
self.failure_count += (test_with_result.result.code == lit.Test.FAIL)
if self.lit_config.maxFailures and \
self.failure_count == self.lit_config.maxFailures:
self.hit_max_failures = True
def _execute_test_impl(test, lit_config, parallelism_semaphores):
"""Execute one test"""
pg = test.config.parallelism_group
if callable(pg):
pg = pg(test)
result = None
semaphore = None
try:
if pg:
semaphore = parallelism_semaphores[pg]
if semaphore:
semaphore.acquire()
start_time = time.time()
result = test.config.test_format.execute(test, lit_config)
# Support deprecated result from execute() which returned the result
# code and additional output as a tuple.
if isinstance(result, tuple):
code, output = result
result = lit.Test.Result(code, output)
elif not isinstance(result, lit.Test.Result):
raise ValueError("unexpected result from test execution")
result.elapsed = time.time() - start_time
except KeyboardInterrupt:
raise
except:
if lit_config.debug:
raise
output = 'Exception during script execution:\n'
output += traceback.format_exc()
output += '\n'
result = lit.Test.Result(lit.Test.UNRESOLVED, output)
finally:
if semaphore:
semaphore.release()
test.setResult(result)
child_lit_config = None
child_parallelism_semaphores = None
def worker_initializer(lit_config, parallelism_semaphores):
"""Copy expensive repeated data into worker processes"""
global child_lit_config
child_lit_config = lit_config
global child_parallelism_semaphores
child_parallelism_semaphores = parallelism_semaphores
def worker_run_one_test(test_index, test):
"""Run one test in a multiprocessing.Pool
Side effects in this function and functions it calls are not visible in the
main lit process.
Arguments and results of this function are pickled, so they should be cheap
to copy. For efficiency, we copy all data needed to execute all tests into
each worker and store it in the child_* global variables. This reduces the
cost of each task.
Returns an index and a Result, which the parent process uses to update
the display.
"""
try:
_execute_test_impl(test, child_lit_config, child_parallelism_semaphores)
return (test_index, test)
except KeyboardInterrupt as e:
# If a worker process gets an interrupt, abort it immediately.
abort_now()
except:
traceback.print_exc()
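The semaphores created in Run.__init__ are driven purely by configuration. A minimal sketch of wiring a parallelism group from lit.cfg, assuming lit_config.parallelism_groups is the dict consulted above (the group name and limit are illustrative):
# lit.cfg -- editor's sketch
# Allow at most 3 tests from this group to run concurrently.
lit_config.parallelism_groups['darwin-asan'] = 3
# Assign every test in this config to the group...
config.parallelism_group = 'darwin-asan'
# ...or compute it per test; _execute_test_impl calls the value as pg(test).
# config.parallelism_group = lambda test: 'darwin-asan'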
diff --git a/utils/lit/tests/Inputs/max-failures/lit.cfg b/utils/lit/tests/Inputs/max-failures/lit.cfg
new file mode 100644
index 000000000000..50d07566e1cc
--- /dev/null
+++ b/utils/lit/tests/Inputs/max-failures/lit.cfg
@@ -0,0 +1,6 @@
+import lit.formats
+config.name = 'shtest-shell'
+config.suffixes = ['.txt']
+config.test_format = lit.formats.ShTest()
+config.test_source_root = os.path.dirname(__file__) + '/../shtest-shell'
+config.test_exec_root = None
diff --git a/utils/lit/tests/max-failures.py b/utils/lit/tests/max-failures.py
index 5cc258dd08aa..bc58e9a4e47f 100644
--- a/utils/lit/tests/max-failures.py
+++ b/utils/lit/tests/max-failures.py
@@ -1,14 +1,14 @@
# Check the behavior of --max-failures option.
#
-# RUN: not %{lit} -j 1 -v %{inputs}/shtest-shell > %t.out
-# RUN: not %{lit} --max-failures=1 -j 1 -v %{inputs}/shtest-shell >> %t.out
-# RUN: not %{lit} --max-failures=2 -j 1 -v %{inputs}/shtest-shell >> %t.out
-# RUN: not %{lit} --max-failures=0 -j 1 -v %{inputs}/shtest-shell 2>> %t.out
+# RUN: not %{lit} -j 1 -v %{inputs}/max-failures > %t.out
+# RUN: not %{lit} --max-failures=1 -j 1 -v %{inputs}/max-failures >> %t.out
+# RUN: not %{lit} --max-failures=2 -j 1 -v %{inputs}/max-failures >> %t.out
+# RUN: not %{lit} --max-failures=0 -j 1 -v %{inputs}/max-failures 2>> %t.out
# RUN: FileCheck < %t.out %s
#
# END.
# CHECK: Failing Tests (3)
# CHECK: Failing Tests (1)
# CHECK: Failing Tests (2)
# CHECK: error: Setting --max-failures to 0 does not have any effect.
diff --git a/utils/lit/tests/selecting.py b/utils/lit/tests/selecting.py
index 19ba240f9b0f..4a0d08b860b8 100644
--- a/utils/lit/tests/selecting.py
+++ b/utils/lit/tests/selecting.py
@@ -1,95 +1,95 @@
# RUN: %{lit} %{inputs}/discovery | FileCheck --check-prefix=CHECK-BASIC %s
# CHECK-BASIC: Testing: 5 tests
# Check that regex-filtering works
#
# RUN: %{lit} --filter 'o[a-z]e' %{inputs}/discovery | FileCheck --check-prefix=CHECK-FILTER %s
# CHECK-FILTER: Testing: 2 of 5 tests
# Check that regex-filtering based on environment variables work.
#
-# RUN: LIT_FILTER='o[a-z]e' %{lit} %{inputs}/discovery | FileCheck --check-prefix=CHECK-FILTER-ENV %s
+# RUN: env LIT_FILTER='o[a-z]e' %{lit} %{inputs}/discovery | FileCheck --check-prefix=CHECK-FILTER-ENV %s
# CHECK-FILTER-ENV: Testing: 2 of 5 tests
# Check that maximum counts work
#
# RUN: %{lit} --max-tests 3 %{inputs}/discovery | FileCheck --check-prefix=CHECK-MAX %s
# CHECK-MAX: Testing: 3 of 5 tests
# Check that sharding partitions the testsuite in a way that distributes the
# rounding error nicely (i.e. 5/3 => 2 2 1, not 1 1 3 or whatever)
#
# RUN: %{lit} --num-shards 3 --run-shard 1 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD0-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD0-OUT < %t.out %s
# CHECK-SHARD0-ERR: note: Selecting shard 1/3 = size 2/5 = tests #(3*k)+1 = [1, 4]
# CHECK-SHARD0-OUT: Testing: 2 of 5 tests
#
# RUN: %{lit} --num-shards 3 --run-shard 2 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD1-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD1-OUT < %t.out %s
# CHECK-SHARD1-ERR: note: Selecting shard 2/3 = size 2/5 = tests #(3*k)+2 = [2, 5]
# CHECK-SHARD1-OUT: Testing: 2 of 5 tests
#
# RUN: %{lit} --num-shards 3 --run-shard 3 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD2-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD2-OUT < %t.out %s
# CHECK-SHARD2-ERR: note: Selecting shard 3/3 = size 1/5 = tests #(3*k)+3 = [3]
# CHECK-SHARD2-OUT: Testing: 1 of 5 tests
# Check that sharding via env vars works.
#
# RUN: env LIT_NUM_SHARDS=3 LIT_RUN_SHARD=1 %{lit} %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD0-ENV-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD0-ENV-OUT < %t.out %s
# CHECK-SHARD0-ENV-ERR: note: Selecting shard 1/3 = size 2/5 = tests #(3*k)+1 = [1, 4]
# CHECK-SHARD0-ENV-OUT: Testing: 2 of 5 tests
#
# RUN: env LIT_NUM_SHARDS=3 LIT_RUN_SHARD=2 %{lit} %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD1-ENV-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD1-ENV-OUT < %t.out %s
# CHECK-SHARD1-ENV-ERR: note: Selecting shard 2/3 = size 2/5 = tests #(3*k)+2 = [2, 5]
# CHECK-SHARD1-ENV-OUT: Testing: 2 of 5 tests
#
# RUN: env LIT_NUM_SHARDS=3 LIT_RUN_SHARD=3 %{lit} %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD2-ENV-ERR < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD2-ENV-OUT < %t.out %s
# CHECK-SHARD2-ENV-ERR: note: Selecting shard 3/3 = size 1/5 = tests #(3*k)+3 = [3]
# CHECK-SHARD2-ENV-OUT: Testing: 1 of 5 tests
# Check that providing more shards than tests results in 1 test per shard
# until we run out, then 0.
#
# RUN: %{lit} --num-shards 100 --run-shard 2 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-ERR1 < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-OUT1 < %t.out %s
# CHECK-SHARD-BIG-ERR1: note: Selecting shard 2/100 = size 1/5 = tests #(100*k)+2 = [2]
# CHECK-SHARD-BIG-OUT1: Testing: 1 of 5 tests
#
# RUN: %{lit} --num-shards 100 --run-shard 6 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-ERR2 < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-OUT2 < %t.out %s
# CHECK-SHARD-BIG-ERR2: note: Selecting shard 6/100 = size 0/5 = tests #(100*k)+6 = []
# CHECK-SHARD-BIG-OUT2: Testing: 0 of 5 tests
#
# RUN: %{lit} --num-shards 100 --run-shard 50 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-ERR3 < %t.err %s
# RUN: FileCheck --check-prefix=CHECK-SHARD-BIG-OUT3 < %t.out %s
# CHECK-SHARD-BIG-ERR3: note: Selecting shard 50/100 = size 0/5 = tests #(100*k)+50 = []
# CHECK-SHARD-BIG-OUT3: Testing: 0 of 5 tests
# Check that range constraints are enforced
#
# RUN: not %{lit} --num-shards 0 --run-shard 2 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD-ERR < %t.err %s
# CHECK-SHARD-ERR: error: --num-shards must be positive
#
# RUN: not %{lit} --num-shards 3 --run-shard 4 %{inputs}/discovery >%t.out 2>%t.err
# RUN: FileCheck --check-prefix=CHECK-SHARD-ERR2 < %t.err %s
# CHECK-SHARD-ERR2: error: --run-shard must be between 1 and --num-shards (inclusive)
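The shard notes checked above follow a simple stride rule: shard r of n gets the tests whose 1-based index i satisfies i = r (mod n). A small sketch of that selection, assuming lit partitions the discovered test list this way:
def select_shard(tests, run_shard, num_shards):
    # Keep tests #(num_shards*k)+run_shard, matching the "Selecting shard"
    # notes above, e.g. 5 tests / 3 shards -> [1, 4], [2, 5], [3].
    return [t for i, t in enumerate(tests, start=1)
            if (i - run_shard) % num_shards == 0]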
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 02d8e7925f6e..66a2c578083e 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -1,596 +1,601 @@
#!/usr/bin/env bash
#===-- test-release.sh - Test the LLVM release candidates ------------------===#
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License.
#
#===------------------------------------------------------------------------===#
#
# Download, build, and test the release candidate for an LLVM release.
#
#===------------------------------------------------------------------------===#
System=`uname -s`
if [ "$System" = "FreeBSD" ]; then
MAKE=gmake
else
MAKE=make
fi
# Base SVN URL for the sources.
Base_url="http://llvm.org/svn/llvm-project"
Release=""
Release_no_dot=""
RC=""
Triple=""
use_gzip="no"
do_checkout="yes"
do_debug="no"
do_asserts="no"
do_compare="yes"
do_rt="yes"
do_libs="yes"
do_libunwind="yes"
do_test_suite="yes"
do_openmp="yes"
do_lld="yes"
do_lldb="no"
do_polly="yes"
BuildDir="`pwd`"
ExtraConfigureFlags=""
ExportBranch=""
function usage() {
echo "usage: `basename $0` -release X.Y.Z -rc NUM [OPTIONS]"
echo ""
echo " -release X.Y.Z The release version to test."
echo " -rc NUM The pre-release candidate number."
echo " -final The final release candidate."
echo " -triple TRIPLE The target triple for this machine."
echo " -j NUM Number of compile jobs to run. [default: 3]"
echo " -build-dir DIR Directory to perform testing in. [default: pwd]"
echo " -no-checkout Don't checkout the sources from SVN."
echo " -test-debug Test the debug build. [default: no]"
echo " -test-asserts Test with asserts on. [default: no]"
echo " -no-compare-files Don't test that phase 2 and 3 files are identical."
echo " -use-gzip Use gzip instead of xz."
echo " -configure-flags FLAGS Extra flags to pass to the configure step."
echo " -svn-path DIR Use the specified DIR instead of a release."
echo " For example -svn-path trunk or -svn-path branches/release_37"
echo " -no-rt Disable check-out & build Compiler-RT"
echo " -no-libs Disable check-out & build libcxx/libcxxabi/libunwind"
echo " -no-libunwind Disable check-out & build libunwind"
echo " -no-test-suite Disable check-out & build test-suite"
echo " -no-openmp Disable check-out & build libomp"
echo " -no-lld Disable check-out & build lld"
echo " -lldb Enable check-out & build lldb"
echo " -no-lldb Disable check-out & build lldb (default)"
echo " -no-polly Disable check-out & build Polly"
}
while [ $# -gt 0 ]; do
case $1 in
-release | --release )
shift
Release="$1"
Release_no_dot="`echo $1 | sed -e 's,\.,,g'`"
;;
-rc | --rc | -RC | --RC )
shift
RC="rc$1"
;;
-final | --final )
RC=final
;;
-svn-path | --svn-path )
shift
Release="test"
Release_no_dot="test"
ExportBranch="$1"
RC="`echo $ExportBranch | sed -e 's,/,_,g'`"
echo "WARNING: Using the branch $ExportBranch instead of a release tag"
echo " This is intended to aid new packagers in trialing "
echo " builds without requiring a tag to be created first"
;;
-triple | --triple )
shift
Triple="$1"
;;
-configure-flags | --configure-flags )
shift
ExtraConfigureFlags="$1"
;;
-j* )
NumJobs="`echo $1 | sed -e 's,-j\([0-9]*\),\1,g'`"
if [ -z "$NumJobs" ]; then
shift
NumJobs="$1"
fi
;;
-build-dir | --build-dir | -builddir | --builddir )
shift
BuildDir="$1"
;;
-no-checkout | --no-checkout )
do_checkout="no"
;;
-test-debug | --test-debug )
do_debug="yes"
;;
-test-asserts | --test-asserts )
do_asserts="yes"
;;
-no-compare-files | --no-compare-files )
do_compare="no"
;;
-use-gzip | --use-gzip )
use_gzip="yes"
;;
-no-rt )
do_rt="no"
;;
-no-libs )
do_libs="no"
;;
-no-libunwind )
do_libunwind="no"
;;
-no-test-suite )
do_test_suite="no"
;;
-no-openmp )
do_openmp="no"
;;
-no-lld )
do_lld="no"
;;
-lldb )
do_lldb="yes"
;;
-no-lldb )
do_lldb="no"
;;
-no-polly )
do_polly="no"
;;
-help | --help | -h | --h | -\? )
usage
exit 0
;;
* )
echo "unknown option: $1"
usage
exit 1
;;
esac
shift
done
# Check required arguments.
if [ -z "$Release" ]; then
echo "error: no release number specified"
exit 1
fi
if [ -z "$RC" ]; then
echo "error: no release candidate number specified"
exit 1
fi
if [ -z "$ExportBranch" ]; then
ExportBranch="tags/RELEASE_$Release_no_dot/$RC"
fi
if [ -z "$Triple" ]; then
echo "error: no target triple specified"
exit 1
fi
# Figure out how many make processes to run.
if [ -z "$NumJobs" ]; then
NumJobs=`sysctl -n hw.activecpu 2> /dev/null || true`
fi
if [ -z "$NumJobs" ]; then
NumJobs=`sysctl -n hw.ncpu 2> /dev/null || true`
fi
if [ -z "$NumJobs" ]; then
NumJobs=`grep -c processor /proc/cpuinfo 2> /dev/null || true`
fi
if [ -z "$NumJobs" ]; then
NumJobs=3
fi
# Projects list
projects="llvm cfe clang-tools-extra"
if [ $do_rt = "yes" ]; then
projects="$projects compiler-rt"
fi
if [ $do_libs = "yes" ]; then
projects="$projects libcxx libcxxabi"
if [ $do_libunwind = "yes" ]; then
projects="$projects libunwind"
fi
fi
case $do_test_suite in
yes|export-only)
projects="$projects test-suite"
;;
esac
if [ $do_openmp = "yes" ]; then
projects="$projects openmp"
fi
if [ $do_lld = "yes" ]; then
projects="$projects lld"
fi
if [ $do_lldb = "yes" ]; then
projects="$projects lldb"
fi
if [ $do_polly = "yes" ]; then
projects="$projects polly"
fi
# Go to the build directory (may be different from CWD)
BuildDir=$BuildDir/$RC
mkdir -p $BuildDir
cd $BuildDir
# Location of log files.
LogDir=$BuildDir/logs
mkdir -p $LogDir
# Final package name.
Package=clang+llvm-$Release
if [ $RC != "final" ]; then
Package=$Package-$RC
fi
Package=$Package-$Triple
# Errors to be highlighted at the end are written to this file.
echo -n > $LogDir/deferred_errors.log
function deferred_error() {
Phase="$1"
Flavor="$2"
Msg="$3"
echo "[${Flavor} Phase${Phase}] ${Msg}" | tee -a $LogDir/deferred_errors.log
}
# Make sure that a required program is available
function check_program_exists() {
local program="$1"
if ! type -P $program > /dev/null 2>&1 ; then
echo "program '$1' not found !"
exit 1
fi
}
if [ "$System" != "Darwin" ]; then
check_program_exists 'chrpath'
check_program_exists 'file'
check_program_exists 'objdump'
fi
# Make sure that the URLs are valid.
function check_valid_urls() {
for proj in $projects ; do
echo "# Validating $proj SVN URL"
if ! svn ls $Base_url/$proj/$ExportBranch > /dev/null 2>&1 ; then
echo "$proj does not have a $ExportBranch branch/tag!"
exit 1
fi
done
}
# Export sources to the build directory.
function export_sources() {
check_valid_urls
for proj in $projects ; do
case $proj in
llvm)
projsrc=$proj.src
;;
cfe)
projsrc=llvm.src/tools/clang
;;
lld|lldb|polly)
projsrc=llvm.src/tools/$proj
;;
clang-tools-extra)
projsrc=llvm.src/tools/clang/tools/extra
;;
compiler-rt|libcxx|libcxxabi|libunwind|openmp)
projsrc=llvm.src/projects/$proj
;;
test-suite)
projsrc=$proj.src
;;
*)
echo "error: unknown project $proj"
exit 1
;;
esac
if [ -d $projsrc ]; then
echo "# Reusing $proj $Release-$RC sources in $projsrc"
continue
fi
echo "# Exporting $proj $Release-$RC sources to $projsrc"
if ! svn export -q $Base_url/$proj/$ExportBranch $projsrc ; then
echo "error: failed to export $proj project"
exit 1
fi
done
cd $BuildDir
}
function configure_llvmCore() {
Phase="$1"
Flavor="$2"
ObjDir="$3"
case $Flavor in
Release )
BuildType="Release"
Assertions="OFF"
;;
Release+Asserts )
BuildType="Release"
Assertions="ON"
;;
Debug )
BuildType="Debug"
Assertions="ON"
;;
* )
echo "# Invalid flavor '$Flavor'"
echo ""
return
;;
esac
echo "# Using C compiler: $c_compiler"
echo "# Using C++ compiler: $cxx_compiler"
cd $ObjDir
echo "# Configuring llvm $Release-$RC $Flavor"
echo "#" env CC="$c_compiler" CXX="$cxx_compiler" \
cmake -G "Unix Makefiles" \
-DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \
$ExtraConfigureFlags $BuildDir/llvm.src \
2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log
env CC="$c_compiler" CXX="$cxx_compiler" \
cmake -G "Unix Makefiles" \
-DCMAKE_BUILD_TYPE=$BuildType -DLLVM_ENABLE_ASSERTIONS=$Assertions \
$ExtraConfigureFlags $BuildDir/llvm.src \
2>&1 | tee $LogDir/llvm.configure-Phase$Phase-$Flavor.log
cd $BuildDir
}
function build_llvmCore() {
Phase="$1"
Flavor="$2"
ObjDir="$3"
DestDir="$4"
cd $ObjDir
echo "# Compiling llvm $Release-$RC $Flavor"
echo "# ${MAKE} -j $NumJobs VERBOSE=1"
${MAKE} -j $NumJobs VERBOSE=1 \
2>&1 | tee $LogDir/llvm.make-Phase$Phase-$Flavor.log
echo "# Installing llvm $Release-$RC $Flavor"
echo "# ${MAKE} install"
${MAKE} install \
DESTDIR="${DestDir}" \
2>&1 | tee $LogDir/llvm.install-Phase$Phase-$Flavor.log
cd $BuildDir
}
function test_llvmCore() {
Phase="$1"
Flavor="$2"
ObjDir="$3"
cd $ObjDir
if ! ( ${MAKE} -j $NumJobs -k check-all \
2>&1 | tee $LogDir/llvm.check-Phase$Phase-$Flavor.log ) ; then
deferred_error $Phase $Flavor "check-all failed"
fi
if [ $do_test_suite = 'yes' ]; then
- SandboxDir="$BuildDir/sandbox"
- Lit=$SandboxDir/bin/lit
- TestSuiteBuildDir="$BuildDir/test-suite-build"
- TestSuiteSrcDir="$BuildDir/test-suite.src"
-
- virtualenv $SandboxDir
- $SandboxDir/bin/python $BuildDir/llvm.src/utils/lit/setup.py install
- mkdir -p $TestSuiteBuildDir
cd $TestSuiteBuildDir
env CC="$c_compiler" CXX="$cxx_compiler" \
cmake $TestSuiteSrcDir -DTEST_SUITE_LIT=$Lit
if ! ( ${MAKE} -j $NumJobs -k check \
2>&1 | tee $LogDir/llvm.check-Phase$Phase-$Flavor.log ) ; then
deferred_error $Phase $Flavor "test suite failed"
fi
fi
cd $BuildDir
}
# Clean RPATH. Libtool adds the build directory to the search path, which is
# not necessary --- and even harmful --- for the binary packages we release.
function clean_RPATH() {
if [ "$System" = "Darwin" ]; then
return
fi
local InstallPath="$1"
for Candidate in `find $InstallPath/{bin,lib} -type f`; do
if file $Candidate | grep ELF | egrep 'executable|shared object' > /dev/null 2>&1 ; then
if rpath=`objdump -x $Candidate | grep 'RPATH'` ; then
rpath=`echo $rpath | sed -e's/^ *RPATH *//'`
if [ -n "$rpath" ]; then
newrpath=`echo $rpath | sed -e's/.*\(\$ORIGIN[^:]*\).*/\1/'`
chrpath -r $newrpath $Candidate 2>&1 > /dev/null 2>&1
fi
fi
fi
done
}
# Create a package of the release binaries.
function package_release() {
cwd=`pwd`
cd $BuildDir/Phase3/Release
mv llvmCore-$Release-$RC.install/usr/local $Package
if [ "$use_gzip" = "yes" ]; then
tar cfz $BuildDir/$Package.tar.gz $Package
else
tar cfJ $BuildDir/$Package.tar.xz $Package
fi
mv $Package llvmCore-$Release-$RC.install/usr/local
cd $cwd
}
# Exit if any command fails
# Note: pipefail is necessary for running build commands through
# a pipe (i.e. it changes the output of ``false | tee /dev/null ; echo $?``)
set -e
set -o pipefail
if [ "$do_checkout" = "yes" ]; then
export_sources
fi
+# Set up the test-suite. Do this early so we can catch failures before
+# we do the full 3 stage build.
+if [ $do_test_suite = "yes" ]; then
+ SandboxDir="$BuildDir/sandbox"
+ Lit=$SandboxDir/bin/lit
+ TestSuiteBuildDir="$BuildDir/test-suite-build"
+ TestSuiteSrcDir="$BuildDir/test-suite.src"
+
+ virtualenv $SandboxDir
+ $SandboxDir/bin/python $BuildDir/llvm.src/utils/lit/setup.py install
+ mkdir -p $TestSuiteBuildDir
+fi
+
(
Flavors="Release"
if [ "$do_debug" = "yes" ]; then
Flavors="Debug $Flavors"
fi
if [ "$do_asserts" = "yes" ]; then
Flavors="$Flavors Release+Asserts"
fi
for Flavor in $Flavors ; do
echo ""
echo ""
echo "********************************************************************************"
echo " Release: $Release-$RC"
echo " Build: $Flavor"
echo " System Info: "
echo " `uname -a`"
echo "********************************************************************************"
echo ""
c_compiler="$CC"
cxx_compiler="$CXX"
llvmCore_phase1_objdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.obj
llvmCore_phase1_destdir=$BuildDir/Phase1/$Flavor/llvmCore-$Release-$RC.install
llvmCore_phase2_objdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.obj
llvmCore_phase2_destdir=$BuildDir/Phase2/$Flavor/llvmCore-$Release-$RC.install
llvmCore_phase3_objdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.obj
llvmCore_phase3_destdir=$BuildDir/Phase3/$Flavor/llvmCore-$Release-$RC.install
rm -rf $llvmCore_phase1_objdir
rm -rf $llvmCore_phase1_destdir
rm -rf $llvmCore_phase2_objdir
rm -rf $llvmCore_phase2_destdir
rm -rf $llvmCore_phase3_objdir
rm -rf $llvmCore_phase3_destdir
mkdir -p $llvmCore_phase1_objdir
mkdir -p $llvmCore_phase1_destdir
mkdir -p $llvmCore_phase2_objdir
mkdir -p $llvmCore_phase2_destdir
mkdir -p $llvmCore_phase3_objdir
mkdir -p $llvmCore_phase3_destdir
############################################################################
# Phase 1: Build llvmCore and clang
echo "# Phase 1: Building llvmCore"
configure_llvmCore 1 $Flavor $llvmCore_phase1_objdir
build_llvmCore 1 $Flavor \
$llvmCore_phase1_objdir $llvmCore_phase1_destdir
clean_RPATH $llvmCore_phase1_destdir/usr/local
########################################################################
# Phase 2: Build llvmCore with newly built clang from phase 1.
c_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang
cxx_compiler=$llvmCore_phase1_destdir/usr/local/bin/clang++
echo "# Phase 2: Building llvmCore"
configure_llvmCore 2 $Flavor $llvmCore_phase2_objdir
build_llvmCore 2 $Flavor \
$llvmCore_phase2_objdir $llvmCore_phase2_destdir
clean_RPATH $llvmCore_phase2_destdir/usr/local
########################################################################
# Phase 3: Build llvmCore with newly built clang from phase 2.
c_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang
cxx_compiler=$llvmCore_phase2_destdir/usr/local/bin/clang++
echo "# Phase 3: Building llvmCore"
configure_llvmCore 3 $Flavor $llvmCore_phase3_objdir
build_llvmCore 3 $Flavor \
$llvmCore_phase3_objdir $llvmCore_phase3_destdir
clean_RPATH $llvmCore_phase3_destdir/usr/local
########################################################################
# Testing: Test phase 3
c_compiler=$llvmCore_phase3_destdir/usr/local/bin/clang
cxx_compiler=$llvmCore_phase3_destdir/usr/local/bin/clang++
echo "# Testing - built with clang"
test_llvmCore 3 $Flavor $llvmCore_phase3_objdir
########################################################################
# Compare .o files between Phase2 and Phase3 and report which ones
# differ.
if [ "$do_compare" = "yes" ]; then
echo
echo "# Comparing Phase 2 and Phase 3 files"
for p2 in `find $llvmCore_phase2_objdir -name '*.o'` ; do
p3=`echo $p2 | sed -e 's,Phase2,Phase3,'`
# Substitute 'Phase2' for 'Phase3' in the Phase 2 object file in
# case there are build paths in the debug info. On some systems,
# sed adds a newline to the output, so pass $p3 through sed too.
if ! cmp -s \
<(env LC_CTYPE=C sed -e 's,Phase2,Phase3,g' -e 's,Phase1,Phase2,g' $p2) \
<(env LC_CTYPE=C sed -e '' $p3) 16 16; then
echo "file `basename $p2` differs between phase 2 and phase 3"
fi
done
fi
done
) 2>&1 | tee $LogDir/testing.$Release-$RC.log
if [ "$use_gzip" = "yes" ]; then
echo "# Packaging the release as $Package.tar.gz"
else
echo "# Packaging the release as $Package.tar.xz"
fi
package_release
set +e
# Woo hoo!
echo "### Testing Finished ###"
echo "### Logs: $LogDir"
echo "### Errors:"
if [ -s "$LogDir/deferred_errors.log" ]; then
cat "$LogDir/deferred_errors.log"
exit 1
else
echo "None."
fi
exit 0
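For reference, a typical invocation matching the options parsed above; the release number, triple, job count, and build directory are placeholders:
# Editor's sketch
./test-release.sh -release 5.0.0 -rc 1 -triple x86_64-unknown-freebsd11 \
    -j 8 -build-dir /tmp/llvm-release -no-lldb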
